Upload folder using huggingface_hub
Browse files- attention_kindselective_n_heads4_seed1340/args.json +1 -1
- attention_kindselective_n_heads4_seed1340/dataloader_10000.pt +3 -0
- attention_kindselective_n_heads4_seed1340/dataloader_12500.pt +3 -0
- attention_kindselective_n_heads4_seed1340/dataloader_42500.pt +3 -0
- attention_kindselective_n_heads4_seed1340/dataloader_45000.pt +3 -0
- attention_kindselective_n_heads4_seed1340/dataloader_47500.pt +3 -0
- attention_kindselective_n_heads4_seed1340/dataloader_49999.pt +3 -0
- attention_kindselective_n_heads4_seed1340/log2.txt +1111 -590
- attention_kindselective_n_heads4_seed1340/model_02500.pt +1 -1
- attention_kindselective_n_heads4_seed1340/model_05000.pt +1 -1
- attention_kindselective_n_heads4_seed1340/model_07500.pt +1 -1
- attention_kindselective_n_heads4_seed1340/model_10000.pt +3 -0
- attention_kindselective_n_heads4_seed1340/model_12500.pt +3 -0
- attention_kindselective_n_heads4_seed1340/model_42500.pt +3 -0
- attention_kindselective_n_heads4_seed1340/model_45000.pt +3 -0
- attention_kindselective_n_heads4_seed1340/model_47500.pt +3 -0
- attention_kindselective_n_heads4_seed1340/model_49999.pt +3 -0
- attention_kindselective_n_heads4_seed1340/optimizer_02500.pt +1 -1
- attention_kindselective_n_heads4_seed1340/optimizer_05000.pt +1 -1
- attention_kindselective_n_heads4_seed1340/optimizer_07500.pt +1 -1
- attention_kindselective_n_heads4_seed1340/optimizer_10000.pt +3 -0
- attention_kindselective_n_heads4_seed1340/optimizer_12500.pt +3 -0
- attention_kindselective_n_heads4_seed1340/optimizer_42500.pt +3 -0
- attention_kindselective_n_heads4_seed1340/optimizer_45000.pt +3 -0
- attention_kindselective_n_heads4_seed1340/optimizer_47500.pt +3 -0
- attention_kindselective_n_heads4_seed1340/optimizer_49999.pt +3 -0
attention_kindselective_n_heads4_seed1340/args.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"hellaswag": true, "attention_kind": "selective", "log_dir": "
|
|
|
|
| 1 |
+
{"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_7/attention_kindselective_n_heads4_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 50000, "warmup_steps": 200, "group": "wider_is_better_7", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 7e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "7e-5_10240_4_1340", "n_embd": 256}
|
attention_kindselective_n_heads4_seed1340/dataloader_10000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3858f6c832feea78a674d8c5c384061cc7d4f22cddbd0a2be6de33bc91e2c72
|
| 3 |
+
size 964
|
attention_kindselective_n_heads4_seed1340/dataloader_12500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ab3779d33c2e0a7873fcd8c39402e44260740665950323ad1445480ec339965a
|
| 3 |
+
size 964
|
attention_kindselective_n_heads4_seed1340/dataloader_42500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf6d24c78d89100d146bce9f26be940db3d71092473d9b55db97d6b35531eac2
|
| 3 |
+
size 964
|
attention_kindselective_n_heads4_seed1340/dataloader_45000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:22bb90b43d81f3da5454f91a70e1ed29aeb2f470a727ce38390ff8a5c4924889
|
| 3 |
+
size 964
|
attention_kindselective_n_heads4_seed1340/dataloader_47500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55507725e6988f190e4963078652fafa6b68e8d4f79221387612612babf3e1c1
|
| 3 |
+
size 964
|
attention_kindselective_n_heads4_seed1340/dataloader_49999.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47776cddb8021172f048a950b83f25b692cb340214b800ce3837c15ceb58907c
|
| 3 |
+
size 964
|
attention_kindselective_n_heads4_seed1340/log2.txt
CHANGED
|
@@ -1,592 +1,1113 @@
|
|
| 1 |
-
max_steps:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
0 val loss 11.2703
|
| 3 |
0 val perplexity 78458.0234
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
3100
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
3200 val
|
| 187 |
-
3200
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
3300
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
3500 val
|
| 220 |
-
3500
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
3600
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
max_steps: 50000
|
| 2 |
+
1100 val loss 7.3709
|
| 3 |
+
1100 val perplexity 1589.1416
|
| 4 |
+
1100 train 6.997883 (lr=9.9927e-05) (hash(x)=37984588)
|
| 5 |
+
40100 val loss 5.8536
|
| 6 |
+
40100 val perplexity 348.4711
|
| 7 |
+
40100 train 5.793257 (lr=9.2472e-06) (hash(x)=48035170)
|
| 8 |
+
1200 val loss 7.3289
|
| 9 |
+
1200 val perplexity 1523.7263
|
| 10 |
+
1200 train 7.504585 (lr=9.9910e-05) (hash(x)=56333817)
|
| 11 |
0 val loss 11.2703
|
| 12 |
0 val perplexity 78458.0234
|
| 13 |
+
1300 val loss 7.3412
|
| 14 |
+
1300 val perplexity 1542.5471
|
| 15 |
+
1300 train 7.483638 (lr=9.9892e-05) (hash(x)=53454056)
|
| 16 |
+
40200 val loss 5.8481
|
| 17 |
+
40200 val perplexity 346.5657
|
| 18 |
+
40200 train 5.675584 (lr=9.1646e-06) (hash(x)=46651322)
|
| 19 |
+
1400 val loss 7.2560
|
| 20 |
+
1400 val perplexity 1416.6073
|
| 21 |
+
1400 train 7.362457 (lr=9.9871e-05) (hash(x)=55284163)
|
| 22 |
+
40300 val loss 5.8508
|
| 23 |
+
40300 val perplexity 347.5022
|
| 24 |
+
40300 train 5.707502 (lr=9.0827e-06) (hash(x)=46378099)
|
| 25 |
+
1500 val loss 7.3036
|
| 26 |
+
1500 val perplexity 1485.6129
|
| 27 |
+
1500 train 7.158344 (lr=9.9849e-05) (hash(x)=48162598)
|
| 28 |
+
0 train 11.272942 (lr=3.5000e-07) (hash(x)=57791809)
|
| 29 |
+
1600 val loss 7.2907
|
| 30 |
+
1600 val perplexity 1466.6249
|
| 31 |
+
1600 train 7.350899 (lr=9.9825e-05) (hash(x)=54214535)
|
| 32 |
+
40400 val loss 5.8443
|
| 33 |
+
40400 val perplexity 345.2604
|
| 34 |
+
40400 train 5.696771 (lr=9.0015e-06) (hash(x)=46495438)
|
| 35 |
+
100 val loss 9.7259
|
| 36 |
+
100 val perplexity 16745.7324
|
| 37 |
+
100 train 9.735422 (lr=3.5350e-05) (hash(x)=48211824)
|
| 38 |
+
1700 val loss 7.2683
|
| 39 |
+
1700 val perplexity 1434.0831
|
| 40 |
+
1700 train 7.474521 (lr=9.9799e-05) (hash(x)=53525003)
|
| 41 |
+
200 val loss 8.1014
|
| 42 |
+
200 val perplexity 3298.9370
|
| 43 |
+
200 train 8.058278 (lr=7.0000e-05) (hash(x)=50375849)
|
| 44 |
+
40500 val loss 5.8388
|
| 45 |
+
40500 val perplexity 343.3772
|
| 46 |
+
40500 train 5.996032 (lr=8.9211e-06) (hash(x)=52059147)
|
| 47 |
+
1800 val loss 7.2337
|
| 48 |
+
1800 val perplexity 1385.3571
|
| 49 |
+
1800 train 7.303545 (lr=9.9771e-05) (hash(x)=51848994)
|
| 50 |
+
300 val loss 7.6442
|
| 51 |
+
300 val perplexity 2088.4370
|
| 52 |
+
300 train 7.951274 (lr=6.9999e-05) (hash(x)=57250808)
|
| 53 |
+
400 val loss 7.5686
|
| 54 |
+
400 val perplexity 1936.4075
|
| 55 |
+
400 train 8.221130 (lr=6.9997e-05) (hash(x)=62519858)
|
| 56 |
+
40600 val loss 5.8407
|
| 57 |
+
40600 val perplexity 344.0050
|
| 58 |
+
40600 train 5.996709 (lr=8.8414e-06) (hash(x)=54885045)
|
| 59 |
+
1900 val loss 7.2157
|
| 60 |
+
1900 val perplexity 1360.6837
|
| 61 |
+
1900 train 7.053372 (lr=9.9741e-05) (hash(x)=48405987)
|
| 62 |
+
500 val loss 7.5147
|
| 63 |
+
500 val perplexity 1834.7661
|
| 64 |
+
500 train 7.395103 (lr=6.9994e-05) (hash(x)=47226806)
|
| 65 |
+
40700 val loss 5.8398
|
| 66 |
+
40700 val perplexity 343.7109
|
| 67 |
+
40700 train 6.230294 (lr=8.7624e-06) (hash(x)=53213971)
|
| 68 |
+
600 val loss 7.4733
|
| 69 |
+
600 val perplexity 1760.4728
|
| 70 |
+
600 train 7.511261 (lr=6.9990e-05) (hash(x)=51149322)
|
| 71 |
+
2000 val loss 7.1997
|
| 72 |
+
2000 val perplexity 1338.9938
|
| 73 |
+
2000 train 7.547995 (lr=9.9710e-05) (hash(x)=58592291)
|
| 74 |
+
700 val loss 7.4403
|
| 75 |
+
700 val perplexity 1703.2408
|
| 76 |
+
700 train 7.431039 (lr=6.9984e-05) (hash(x)=51564551)
|
| 77 |
+
40800 val loss 5.8359
|
| 78 |
+
40800 val perplexity 342.3692
|
| 79 |
+
40800 train 5.498599 (lr=8.6842e-06) (hash(x)=45133794)
|
| 80 |
+
2100 val loss 7.1929
|
| 81 |
+
2100 val perplexity 1329.9009
|
| 82 |
+
2100 train 7.272621 (lr=9.9677e-05) (hash(x)=51167081)
|
| 83 |
+
800 val loss 7.4083
|
| 84 |
+
800 val perplexity 1649.6486
|
| 85 |
+
800 train 7.160203 (lr=6.9977e-05) (hash(x)=45093459)
|
| 86 |
+
40900 val loss 5.8379
|
| 87 |
+
40900 val perplexity 343.0574
|
| 88 |
+
40900 train 5.988767 (lr=8.6068e-06) (hash(x)=56546547)
|
| 89 |
+
900 val loss 7.3864
|
| 90 |
+
900 val perplexity 1613.9653
|
| 91 |
+
900 train 7.685310 (lr=6.9969e-05) (hash(x)=54988361)
|
| 92 |
+
2200 val loss 7.1644
|
| 93 |
+
2200 val perplexity 1292.5675
|
| 94 |
+
2200 train 7.169672 (lr=9.9642e-05) (hash(x)=47994988)
|
| 95 |
+
1000 val loss 7.3699
|
| 96 |
+
1000 val perplexity 1587.5004
|
| 97 |
+
1000 train 7.238439 (lr=6.9960e-05) (hash(x)=47588648)
|
| 98 |
+
41000 val loss 5.8354
|
| 99 |
+
41000 val perplexity 342.1882
|
| 100 |
+
41000 train 6.221378 (lr=8.5301e-06) (hash(x)=49552270)
|
| 101 |
+
2300 val loss 7.1801
|
| 102 |
+
2300 val perplexity 1313.0338
|
| 103 |
+
2300 train 7.161005 (lr=9.9606e-05) (hash(x)=47377604)
|
| 104 |
+
1100 val loss 7.3479
|
| 105 |
+
1100 val perplexity 1552.9474
|
| 106 |
+
1100 train 6.955181 (lr=6.9949e-05) (hash(x)=37984588)
|
| 107 |
+
1200 val loss 7.3201
|
| 108 |
+
1200 val perplexity 1510.3381
|
| 109 |
+
1200 train 7.489902 (lr=6.9937e-05) (hash(x)=56333817)
|
| 110 |
+
41100 val loss 5.8354
|
| 111 |
+
41100 val perplexity 342.2155
|
| 112 |
+
41100 train 6.067726 (lr=8.4541e-06) (hash(x)=51222370)
|
| 113 |
+
2400 val loss 7.1777
|
| 114 |
+
2400 val perplexity 1309.8896
|
| 115 |
+
2400 train 7.223081 (lr=9.9567e-05) (hash(x)=53554323)
|
| 116 |
+
1300 val loss 7.3031
|
| 117 |
+
1300 val perplexity 1484.8395
|
| 118 |
+
1300 train 7.442085 (lr=6.9924e-05) (hash(x)=53454056)
|
| 119 |
+
41200 val loss 5.8369
|
| 120 |
+
41200 val perplexity 342.7026
|
| 121 |
+
41200 train 5.903095 (lr=8.3789e-06) (hash(x)=50883608)
|
| 122 |
+
2500 val loss 7.1743
|
| 123 |
+
2500 val perplexity 1305.3995
|
| 124 |
+
2500 train 7.199438 (lr=9.9527e-05) (hash(x)=50780417)
|
| 125 |
+
1400 val loss 7.2946
|
| 126 |
+
1400 val perplexity 1472.2584
|
| 127 |
+
1400 train 7.407144 (lr=6.9910e-05) (hash(x)=55284163)
|
| 128 |
+
1500 val loss 7.3019
|
| 129 |
+
1500 val perplexity 1483.1653
|
| 130 |
+
1500 train 7.143060 (lr=6.9894e-05) (hash(x)=48162598)
|
| 131 |
+
41300 val loss 5.8382
|
| 132 |
+
41300 val perplexity 343.1662
|
| 133 |
+
41300 train 6.058548 (lr=8.3045e-06) (hash(x)=52996828)
|
| 134 |
+
2600 val loss 7.1702
|
| 135 |
+
2600 val perplexity 1300.0759
|
| 136 |
+
2600 train 7.071150 (lr=9.9485e-05) (hash(x)=46453562)
|
| 137 |
+
1600 val loss 7.2579
|
| 138 |
+
1600 val perplexity 1419.3342
|
| 139 |
+
1600 train 7.321304 (lr=6.9877e-05) (hash(x)=54214535)
|
| 140 |
+
41400 val loss 5.8361
|
| 141 |
+
41400 val perplexity 342.4566
|
| 142 |
+
41400 train 5.811955 (lr=8.2308e-06) (hash(x)=48822716)
|
| 143 |
+
2700 val loss 7.2418
|
| 144 |
+
2700 val perplexity 1396.6613
|
| 145 |
+
2700 train 7.236431 (lr=9.9442e-05) (hash(x)=54404221)
|
| 146 |
+
1700 val loss 7.2608
|
| 147 |
+
1700 val perplexity 1423.4137
|
| 148 |
+
1700 train 7.450740 (lr=6.9859e-05) (hash(x)=53525003)
|
| 149 |
+
1800 val loss 7.2223
|
| 150 |
+
1800 val perplexity 1369.6931
|
| 151 |
+
1800 train 7.294693 (lr=6.9840e-05) (hash(x)=51848994)
|
| 152 |
+
41500 val loss 5.8375
|
| 153 |
+
41500 val perplexity 342.9267
|
| 154 |
+
41500 train 5.816633 (lr=8.1579e-06) (hash(x)=50936577)
|
| 155 |
+
2800 val loss 7.1606
|
| 156 |
+
2800 val perplexity 1287.7007
|
| 157 |
+
2800 train 7.852295 (lr=9.9396e-05) (hash(x)=59318895)
|
| 158 |
+
1900 val loss 7.2239
|
| 159 |
+
1900 val perplexity 1371.8162
|
| 160 |
+
1900 train 7.046358 (lr=6.9819e-05) (hash(x)=48405987)
|
| 161 |
+
41600 val loss 5.8377
|
| 162 |
+
41600 val perplexity 343.0029
|
| 163 |
+
41600 train 5.881047 (lr=8.0858e-06) (hash(x)=44375046)
|
| 164 |
+
2900 val loss 7.1619
|
| 165 |
+
2900 val perplexity 1289.3486
|
| 166 |
+
2900 train 7.149612 (lr=9.9349e-05) (hash(x)=47845760)
|
| 167 |
+
2000 val loss 7.2136
|
| 168 |
+
2000 val perplexity 1357.7904
|
| 169 |
+
2000 train 7.559543 (lr=6.9797e-05) (hash(x)=58592291)
|
| 170 |
+
2100 val loss 7.2009
|
| 171 |
+
2100 val perplexity 1340.6804
|
| 172 |
+
2100 train 7.274922 (lr=6.9774e-05) (hash(x)=51167081)
|
| 173 |
+
41700 val loss 5.8362
|
| 174 |
+
41700 val perplexity 342.4806
|
| 175 |
+
41700 train 5.599354 (lr=8.0144e-06) (hash(x)=44060021)
|
| 176 |
+
3000 val loss 7.1622
|
| 177 |
+
3000 val perplexity 1289.7507
|
| 178 |
+
3000 train 6.873025 (lr=9.9300e-05) (hash(x)=44336167)
|
| 179 |
+
2200 val loss 7.1883
|
| 180 |
+
2200 val perplexity 1323.8024
|
| 181 |
+
2200 train 7.174533 (lr=6.9750e-05) (hash(x)=47994988)
|
| 182 |
+
41800 val loss 5.8358
|
| 183 |
+
41800 val perplexity 342.3502
|
| 184 |
+
41800 train 6.128627 (lr=7.9438e-06) (hash(x)=57765221)
|
| 185 |
+
2300 val loss 7.2004
|
| 186 |
+
2300 val perplexity 1340.0284
|
| 187 |
+
3100 val loss 7.1277
|
| 188 |
+
3100 val perplexity 1246.0415
|
| 189 |
+
2300 train 7.185493 (lr=6.9724e-05) (hash(x)=47377604)
|
| 190 |
+
3100 train 7.390228 (lr=9.9249e-05) (hash(x)=44479330)
|
| 191 |
+
2400 val loss 7.1732
|
| 192 |
+
2400 val perplexity 1303.9562
|
| 193 |
+
2400 train 7.220889 (lr=6.9697e-05) (hash(x)=53554323)
|
| 194 |
+
3200 val loss 7.1646
|
| 195 |
+
3200 val perplexity 1292.8153
|
| 196 |
+
3200 train 7.241447 (lr=9.9197e-05) (hash(x)=54593096)
|
| 197 |
+
41900 val loss 5.8348
|
| 198 |
+
41900 val perplexity 341.9940
|
| 199 |
+
41900 train 5.597972 (lr=7.8740e-06) (hash(x)=46051470)
|
| 200 |
+
2500 val loss 7.1729
|
| 201 |
+
2500 val perplexity 1303.6392
|
| 202 |
+
2500 train 7.214406 (lr=6.9669e-05) (hash(x)=50780417)
|
| 203 |
+
3300 val loss 7.1778
|
| 204 |
+
3300 val perplexity 1309.9883
|
| 205 |
+
3300 train 7.088508 (lr=9.9142e-05) (hash(x)=45347643)
|
| 206 |
+
2600 val loss 7.1263
|
| 207 |
+
2600 val perplexity 1244.2609
|
| 208 |
+
2600 train 7.028562 (lr=6.9640e-05) (hash(x)=46453562)
|
| 209 |
+
42000 val loss 5.8415
|
| 210 |
+
42000 val perplexity 344.2894
|
| 211 |
+
42000 train 6.066402 (lr=7.8050e-06) (hash(x)=52077616)
|
| 212 |
+
2700 val loss 7.1115
|
| 213 |
+
2700 val perplexity 1225.9312
|
| 214 |
+
2700 train 7.080199 (lr=6.9609e-05) (hash(x)=54404221)
|
| 215 |
+
3400 val loss 7.1109
|
| 216 |
+
3400 val perplexity 1225.2228
|
| 217 |
+
3400 train 7.224405 (lr=9.9086e-05) (hash(x)=47797247)
|
| 218 |
+
42100 val loss 5.8336
|
| 219 |
+
42100 val perplexity 341.5976
|
| 220 |
+
42100 train 5.685880 (lr=7.7368e-06) (hash(x)=47845199)
|
| 221 |
+
2800 val loss 7.1089
|
| 222 |
+
2800 val perplexity 1222.8175
|
| 223 |
+
2800 train 7.750087 (lr=6.9577e-05) (hash(x)=59318895)
|
| 224 |
+
2900 val loss 7.1042
|
| 225 |
+
2900 val perplexity 1217.0992
|
| 226 |
+
2900 train 7.102781 (lr=6.9544e-05) (hash(x)=47845760)
|
| 227 |
+
3500 val loss 7.1129
|
| 228 |
+
3500 val perplexity 1227.7335
|
| 229 |
+
3500 train 7.017200 (lr=9.9028e-05) (hash(x)=46115683)
|
| 230 |
+
42200 val loss 5.8312
|
| 231 |
+
42200 val perplexity 340.7710
|
| 232 |
+
42200 train 6.088301 (lr=7.6693e-06) (hash(x)=51549823)
|
| 233 |
+
3000 val loss 7.0754
|
| 234 |
+
3000 val perplexity 1182.5112
|
| 235 |
+
3000 train 6.773137 (lr=6.9510e-05) (hash(x)=44336167)
|
| 236 |
+
3600 val loss 7.1055
|
| 237 |
+
3600 val perplexity 1218.6689
|
| 238 |
+
3600 train 6.937795 (lr=9.8969e-05) (hash(x)=44502074)
|
| 239 |
+
42300 val loss 5.8243
|
| 240 |
+
42300 val perplexity 338.4359
|
| 241 |
+
42300 train 5.965167 (lr=7.6027e-06) (hash(x)=56922131)
|
| 242 |
+
3100 val loss 7.0511
|
| 243 |
+
3100 val perplexity 1154.1840
|
| 244 |
+
3100 train 7.331172 (lr=6.9474e-05) (hash(x)=44479330)
|
| 245 |
+
3200 val loss 7.0365
|
| 246 |
+
3200 val perplexity 1137.3547
|
| 247 |
+
3200 train 7.123116 (lr=6.9438e-05) (hash(x)=54593096)
|
| 248 |
+
3700 val loss 7.0926
|
| 249 |
+
3700 val perplexity 1202.9988
|
| 250 |
+
3700 train 7.232045 (lr=9.8908e-05) (hash(x)=55388443)
|
| 251 |
+
42400 val loss 5.8262
|
| 252 |
+
42400 val perplexity 339.0629
|
| 253 |
+
42400 train 5.787869 (lr=7.5368e-06) (hash(x)=49004372)
|
| 254 |
+
3300 val loss 7.0396
|
| 255 |
+
3300 val perplexity 1140.9235
|
| 256 |
+
3300 train 6.961339 (lr=6.9400e-05) (hash(x)=45347643)
|
| 257 |
+
3800 val loss 7.0881
|
| 258 |
+
3800 val perplexity 1197.5861
|
| 259 |
+
3800 train 6.871030 (lr=9.8845e-05) (hash(x)=43790341)
|
| 260 |
+
42500 val loss 5.8224
|
| 261 |
+
42500 val perplexity 337.7739
|
| 262 |
+
42500 train 5.810077 (lr=7.4717e-06) (hash(x)=50651839)
|
| 263 |
+
3400 val loss 7.0278
|
| 264 |
+
3400 val perplexity 1127.5500
|
| 265 |
+
3400 train 7.154087 (lr=6.9360e-05) (hash(x)=47797247)
|
| 266 |
+
3500 val loss 7.0127
|
| 267 |
+
3500 val perplexity 1110.6456
|
| 268 |
+
3500 train 6.888947 (lr=6.9320e-05) (hash(x)=46115683)
|
| 269 |
+
3900 val loss 7.0927
|
| 270 |
+
3900 val perplexity 1203.1331
|
| 271 |
+
3900 train 7.089813 (lr=9.8780e-05) (hash(x)=50013318)
|
| 272 |
+
42600 val loss 5.8226
|
| 273 |
+
42600 val perplexity 337.8404
|
| 274 |
+
42600 train 6.146226 (lr=7.4074e-06) (hash(x)=50767721)
|
| 275 |
+
3600 val loss 6.9927
|
| 276 |
+
3600 val perplexity 1088.6067
|
| 277 |
+
3600 train 6.841239 (lr=6.9278e-05) (hash(x)=44502074)
|
| 278 |
+
4000 val loss 7.0888
|
| 279 |
+
4000 val perplexity 1198.4447
|
| 280 |
+
4000 train 7.173865 (lr=9.8713e-05) (hash(x)=51704787)
|
| 281 |
+
42700 val loss 5.8274
|
| 282 |
+
42700 val perplexity 339.4667
|
| 283 |
+
42700 train 5.559310 (lr=7.3440e-06) (hash(x)=49099183)
|
| 284 |
+
3700 val loss 6.9963
|
| 285 |
+
3700 val perplexity 1092.6323
|
| 286 |
+
3700 train 7.130051 (lr=6.9235e-05) (hash(x)=55388443)
|
| 287 |
+
3800 val loss 6.9835
|
| 288 |
+
3800 val perplexity 1078.6748
|
| 289 |
+
3800 train 6.767923 (lr=6.9191e-05) (hash(x)=43790341)
|
| 290 |
+
4100 val loss 7.0791
|
| 291 |
+
4100 val perplexity 1186.8921
|
| 292 |
+
4100 train 7.136350 (lr=9.8645e-05) (hash(x)=50821964)
|
| 293 |
+
42800 val loss 5.8205
|
| 294 |
+
42800 val perplexity 337.1293
|
| 295 |
+
42800 train 6.236982 (lr=7.2813e-06) (hash(x)=42272413)
|
| 296 |
+
3900 val loss 6.9793
|
| 297 |
+
3900 val perplexity 1074.1350
|
| 298 |
+
3900 train 6.971794 (lr=6.9146e-05) (hash(x)=50013318)
|
| 299 |
+
4200 val loss 7.1331
|
| 300 |
+
4200 val perplexity 1252.7737
|
| 301 |
+
4200 train 7.145490 (lr=9.8575e-05) (hash(x)=49675080)
|
| 302 |
+
42900 val loss 5.8239
|
| 303 |
+
42900 val perplexity 338.2816
|
| 304 |
+
42900 train 5.616502 (lr=7.2194e-06) (hash(x)=48582863)
|
| 305 |
+
4000 val loss 6.9700
|
| 306 |
+
4000 val perplexity 1064.2661
|
| 307 |
+
4000 train 7.038657 (lr=6.9099e-05) (hash(x)=51704787)
|
| 308 |
+
4100 val loss 6.9672
|
| 309 |
+
4100 val perplexity 1061.2119
|
| 310 |
+
4100 train 7.042075 (lr=6.9051e-05) (hash(x)=50821964)
|
| 311 |
+
4300 val loss 7.1120
|
| 312 |
+
4300 val perplexity 1226.5977
|
| 313 |
+
4300 train 6.766655 (lr=9.8503e-05) (hash(x)=43239281)
|
| 314 |
+
43000 val loss 5.8217
|
| 315 |
+
43000 val perplexity 337.5590
|
| 316 |
+
43000 train 5.796019 (lr=7.1583e-06) (hash(x)=48703446)
|
| 317 |
+
4200 val loss 6.9870
|
| 318 |
+
4200 val perplexity 1082.4434
|
| 319 |
+
4200 train 6.998270 (lr=6.9002e-05) (hash(x)=49675080)
|
| 320 |
+
4400 val loss 7.1208
|
| 321 |
+
4400 val perplexity 1237.4618
|
| 322 |
+
4400 train 6.820117 (lr=9.8430e-05) (hash(x)=45076737)
|
| 323 |
+
4300 val loss 6.9632
|
| 324 |
+
4300 val perplexity 1056.9971
|
| 325 |
+
4300 train 6.617874 (lr=6.8952e-05) (hash(x)=43239281)
|
| 326 |
+
43100 val loss 5.8203
|
| 327 |
+
43100 val perplexity 337.0722
|
| 328 |
+
43100 train 5.743421 (lr=7.0981e-06) (hash(x)=48730321)
|
| 329 |
+
4400 val loss 6.9505
|
| 330 |
+
4400 val perplexity 1043.6844
|
| 331 |
+
4400 train 6.636121 (lr=6.8901e-05) (hash(x)=45076737)
|
| 332 |
+
4500 val loss 7.1026
|
| 333 |
+
4500 val perplexity 1215.1410
|
| 334 |
+
4500 train 7.188966 (lr=9.8355e-05) (hash(x)=57930262)
|
| 335 |
+
43200 val loss 5.8184
|
| 336 |
+
43200 val perplexity 336.4272
|
| 337 |
+
43200 train 6.053661 (lr=7.0386e-06) (hash(x)=56536090)
|
| 338 |
+
4500 val loss 6.9386
|
| 339 |
+
4500 val perplexity 1031.2871
|
| 340 |
+
4500 train 7.046677 (lr=6.8848e-05) (hash(x)=57930262)
|
| 341 |
+
4600 val loss 7.0943
|
| 342 |
+
4600 val perplexity 1205.0604
|
| 343 |
+
4600 train 6.878264 (lr=9.8278e-05) (hash(x)=46721614)
|
| 344 |
+
4600 val loss 6.9329
|
| 345 |
+
4600 val perplexity 1025.4572
|
| 346 |
+
4600 train 6.712116 (lr=6.8794e-05) (hash(x)=46721614)
|
| 347 |
+
43300 val loss 5.8176
|
| 348 |
+
43300 val perplexity 336.1527
|
| 349 |
+
43300 train 5.962167 (lr=6.9800e-06) (hash(x)=54154116)
|
| 350 |
+
4700 val loss 6.9091
|
| 351 |
+
4700 val perplexity 1001.3547
|
| 352 |
+
4700 train 6.706449 (lr=6.8739e-05) (hash(x)=49837920)
|
| 353 |
+
4700 val loss 7.0600
|
| 354 |
+
4700 val perplexity 1164.4540
|
| 355 |
+
4700 train 6.883050 (lr=9.8199e-05) (hash(x)=49837920)
|
| 356 |
+
4800 val loss 6.8954
|
| 357 |
+
4800 val perplexity 987.7471
|
| 358 |
+
4800 train 7.025270 (lr=6.8683e-05) (hash(x)=48380045)
|
| 359 |
+
43400 val loss 5.8258
|
| 360 |
+
43400 val perplexity 338.9396
|
| 361 |
+
43400 train 5.723001 (lr=6.9222e-06) (hash(x)=50058055)
|
| 362 |
+
4800 val loss 7.0530
|
| 363 |
+
4800 val perplexity 1156.3136
|
| 364 |
+
4900 val loss 6.8825
|
| 365 |
+
4900 val perplexity 975.0323
|
| 366 |
+
4800 train 7.163837 (lr=9.8119e-05) (hash(x)=48380045)
|
| 367 |
+
4900 train 6.764085 (lr=6.8626e-05) (hash(x)=44202577)
|
| 368 |
+
43500 val loss 5.8194
|
| 369 |
+
43500 val perplexity 336.7863
|
| 370 |
+
43500 train 5.749609 (lr=6.8652e-06) (hash(x)=48743802)
|
| 371 |
+
5000 val loss 6.8525
|
| 372 |
+
5000 val perplexity 946.2589
|
| 373 |
+
5000 train 6.858996 (lr=6.8567e-05) (hash(x)=52038024)
|
| 374 |
+
4900 val loss 7.0500
|
| 375 |
+
4900 val perplexity 1152.8193
|
| 376 |
+
4900 train 6.937270 (lr=9.8036e-05) (hash(x)=44202577)
|
| 377 |
+
5100 val loss 6.8317
|
| 378 |
+
5100 val perplexity 926.8068
|
| 379 |
+
5100 train 7.012638 (lr=6.8507e-05) (hash(x)=53700038)
|
| 380 |
+
43600 val loss 5.8188
|
| 381 |
+
43600 val perplexity 336.5771
|
| 382 |
+
43600 train 5.458486 (lr=6.8090e-06) (hash(x)=42792886)
|
| 383 |
+
5200 val loss 6.8274
|
| 384 |
+
5200 val perplexity 922.8022
|
| 385 |
+
5200 train 6.814481 (lr=6.8446e-05) (hash(x)=48137625)
|
| 386 |
+
5000 val loss 7.0187
|
| 387 |
+
5000 val perplexity 1117.3700
|
| 388 |
+
5000 train 7.044736 (lr=9.7953e-05) (hash(x)=52038024)
|
| 389 |
+
43700 val loss 5.8163
|
| 390 |
+
43700 val perplexity 335.7397
|
| 391 |
+
43700 train 6.229331 (lr=6.7537e-06) (hash(x)=56446070)
|
| 392 |
+
5300 val loss 6.8135
|
| 393 |
+
5300 val perplexity 910.0942
|
| 394 |
+
5300 train 6.670180 (lr=6.8384e-05) (hash(x)=43161573)
|
| 395 |
+
5100 val loss 7.0031
|
| 396 |
+
5100 val perplexity 1100.0636
|
| 397 |
+
5100 train 7.189824 (lr=9.7867e-05) (hash(x)=53700038)
|
| 398 |
+
5400 val loss 6.8088
|
| 399 |
+
5400 val perplexity 905.7951
|
| 400 |
+
5400 train 7.007874 (lr=6.8320e-05) (hash(x)=56673322)
|
| 401 |
+
43800 val loss 5.8178
|
| 402 |
+
43800 val perplexity 336.2392
|
| 403 |
+
43800 train 5.653908 (lr=6.6992e-06) (hash(x)=45584354)
|
| 404 |
+
5500 val loss 6.8175
|
| 405 |
+
5500 val perplexity 913.7433
|
| 406 |
+
5500 train 6.971771 (lr=6.8256e-05) (hash(x)=53468295)
|
| 407 |
+
5200 val loss 6.9942
|
| 408 |
+
5200 val perplexity 1090.2700
|
| 409 |
+
5200 train 6.960814 (lr=9.7780e-05) (hash(x)=48137625)
|
| 410 |
+
43900 val loss 5.8161
|
| 411 |
+
43900 val perplexity 335.6672
|
| 412 |
+
43900 train 5.664525 (lr=6.6455e-06) (hash(x)=49339253)
|
| 413 |
+
5600 val loss 6.7675
|
| 414 |
+
5600 val perplexity 869.1533
|
| 415 |
+
5600 train 7.060785 (lr=6.8190e-05) (hash(x)=59287280)
|
| 416 |
+
5300 val loss 6.9828
|
| 417 |
+
5300 val perplexity 1077.8917
|
| 418 |
+
5300 train 6.854216 (lr=9.7691e-05) (hash(x)=43161573)
|
| 419 |
+
5700 val loss 6.7728
|
| 420 |
+
5700 val perplexity 873.7137
|
| 421 |
+
5700 train 7.033890 (lr=6.8123e-05) (hash(x)=57575806)
|
| 422 |
+
44000 val loss 5.8158
|
| 423 |
+
44000 val perplexity 335.5459
|
| 424 |
+
44000 train 5.635589 (lr=6.5926e-06) (hash(x)=46183203)
|
| 425 |
+
5800 val loss 6.7736
|
| 426 |
+
5800 val perplexity 874.4873
|
| 427 |
+
5800 train 6.723006 (lr=6.8055e-05) (hash(x)=46897279)
|
| 428 |
+
5400 val loss 6.9617
|
| 429 |
+
5400 val perplexity 1055.4479
|
| 430 |
+
5400 train 7.139781 (lr=9.7600e-05) (hash(x)=56673322)
|
| 431 |
+
44100 val loss 5.8135
|
| 432 |
+
44100 val perplexity 334.7792
|
| 433 |
+
44100 train 5.802389 (lr=6.5406e-06) (hash(x)=47849630)
|
| 434 |
+
5900 val loss 6.7703
|
| 435 |
+
5900 val perplexity 871.5712
|
| 436 |
+
5900 train 6.636249 (lr=6.7985e-05) (hash(x)=47565679)
|
| 437 |
+
5500 val loss 6.9375
|
| 438 |
+
5500 val perplexity 1030.2126
|
| 439 |
+
5500 train 7.112161 (lr=9.7508e-05) (hash(x)=53468295)
|
| 440 |
+
6000 val loss 6.7804
|
| 441 |
+
6000 val perplexity 880.4198
|
| 442 |
+
6000 train 6.648599 (lr=6.7915e-05) (hash(x)=51590090)
|
| 443 |
+
44200 val loss 5.8127
|
| 444 |
+
44200 val perplexity 334.5341
|
| 445 |
+
44200 train 6.146719 (lr=6.4894e-06) (hash(x)=49834275)
|
| 446 |
+
6100 val loss 6.7618
|
| 447 |
+
6100 val perplexity 864.2370
|
| 448 |
+
6100 train 7.137344 (lr=6.7843e-05) (hash(x)=59732271)
|
| 449 |
+
5600 val loss 6.9085
|
| 450 |
+
5600 val perplexity 1000.7457
|
| 451 |
+
5600 train 7.224924 (lr=9.7414e-05) (hash(x)=59287280)
|
| 452 |
+
44300 val loss 5.8095
|
| 453 |
+
44300 val perplexity 333.4532
|
| 454 |
+
44300 train 6.254009 (lr=6.4390e-06) (hash(x)=62535257)
|
| 455 |
+
6200 val loss 6.7642
|
| 456 |
+
6200 val perplexity 866.2368
|
| 457 |
+
6200 train 6.741118 (lr=6.7770e-05) (hash(x)=46394422)
|
| 458 |
+
5700 val loss 6.9043
|
| 459 |
+
5700 val perplexity 996.5828
|
| 460 |
+
5700 train 7.165204 (lr=9.7318e-05) (hash(x)=57575806)
|
| 461 |
+
6300 val loss 6.7599
|
| 462 |
+
6300 val perplexity 862.5737
|
| 463 |
+
6300 train 6.768298 (lr=6.7696e-05) (hash(x)=53748145)
|
| 464 |
+
44400 val loss 5.8078
|
| 465 |
+
44400 val perplexity 332.8983
|
| 466 |
+
44400 train 5.866293 (lr=6.3895e-06) (hash(x)=49253957)
|
| 467 |
+
6400 val loss 6.7398
|
| 468 |
+
6400 val perplexity 845.4134
|
| 469 |
+
6400 train 6.611226 (lr=6.7621e-05) (hash(x)=46054751)
|
| 470 |
+
5800 val loss 6.8937
|
| 471 |
+
5800 val perplexity 986.0107
|
| 472 |
+
5800 train 6.843620 (lr=9.7221e-05) (hash(x)=46897279)
|
| 473 |
+
6500 val loss 6.7447
|
| 474 |
+
6500 val perplexity 849.5231
|
| 475 |
+
6500 train 7.008954 (lr=6.7545e-05) (hash(x)=51816809)
|
| 476 |
+
44500 val loss 5.8086
|
| 477 |
+
44500 val perplexity 333.1573
|
| 478 |
+
44500 train 5.787062 (lr=6.3408e-06) (hash(x)=55368339)
|
| 479 |
+
5900 val loss 6.8839
|
| 480 |
+
5900 val perplexity 976.4327
|
| 481 |
+
5900 train 6.751657 (lr=9.7122e-05) (hash(x)=47565679)
|
| 482 |
+
6600 val loss 6.7294
|
| 483 |
+
6600 val perplexity 836.6632
|
| 484 |
+
6600 train 6.619945 (lr=6.7467e-05) (hash(x)=52453336)
|
| 485 |
+
44600 val loss 5.8085
|
| 486 |
+
44600 val perplexity 333.1111
|
| 487 |
+
44600 train 5.920068 (lr=6.2929e-06) (hash(x)=47098476)
|
| 488 |
+
6700 val loss 6.7275
|
| 489 |
+
6700 val perplexity 835.0275
|
| 490 |
+
6700 train 6.752491 (lr=6.7389e-05) (hash(x)=49108775)
|
| 491 |
+
6000 val loss 6.8816
|
| 492 |
+
6000 val perplexity 974.1851
|
| 493 |
+
6000 train 6.755291 (lr=9.7021e-05) (hash(x)=51590090)
|
| 494 |
+
6800 val loss 6.7240
|
| 495 |
+
6800 val perplexity 832.1128
|
| 496 |
+
6800 train 6.615003 (lr=6.7309e-05) (hash(x)=46745396)
|
| 497 |
+
44700 val loss 5.8119
|
| 498 |
+
44700 val perplexity 334.2415
|
| 499 |
+
44700 train 5.744551 (lr=6.2459e-06) (hash(x)=48280562)
|
| 500 |
+
6100 val loss 6.8433
|
| 501 |
+
6100 val perplexity 937.6213
|
| 502 |
+
6100 train 7.231963 (lr=9.6919e-05) (hash(x)=59732271)
|
| 503 |
+
6900 val loss 6.7129
|
| 504 |
+
6900 val perplexity 822.9876
|
| 505 |
+
6900 train 6.874073 (lr=6.7228e-05) (hash(x)=46534986)
|
| 506 |
+
44800 val loss 5.8111
|
| 507 |
+
44800 val perplexity 333.9803
|
| 508 |
+
44800 train 6.081460 (lr=6.1998e-06) (hash(x)=55591638)
|
| 509 |
+
7000 val loss 6.7105
|
| 510 |
+
7000 val perplexity 821.0043
|
| 511 |
+
7000 train 7.102075 (lr=6.7146e-05) (hash(x)=49317888)
|
| 512 |
+
6200 val loss 6.8340
|
| 513 |
+
6200 val perplexity 928.8552
|
| 514 |
+
6200 train 6.807478 (lr=9.6815e-05) (hash(x)=46394422)
|
| 515 |
+
7100 val loss 6.7149
|
| 516 |
+
7100 val perplexity 824.5883
|
| 517 |
+
7100 train 6.716461 (lr=6.7063e-05) (hash(x)=50360484)
|
| 518 |
+
44900 val loss 5.8106
|
| 519 |
+
44900 val perplexity 333.8185
|
| 520 |
+
44900 train 5.963002 (lr=6.1545e-06) (hash(x)=53757748)
|
| 521 |
+
6300 val loss 6.8275
|
| 522 |
+
6300 val perplexity 922.9268
|
| 523 |
+
6300 train 6.854854 (lr=9.6709e-05) (hash(x)=53748145)
|
| 524 |
+
7200 val loss 6.7289
|
| 525 |
+
7200 val perplexity 836.1858
|
| 526 |
+
7200 train 6.611360 (lr=6.6978e-05) (hash(x)=49515094)
|
| 527 |
+
45000 val loss 5.8083
|
| 528 |
+
45000 val perplexity 333.0545
|
| 529 |
+
45000 train 5.965748 (lr=6.1100e-06) (hash(x)=51685087)
|
| 530 |
+
7300 val loss 6.7329
|
| 531 |
+
7300 val perplexity 839.6050
|
| 532 |
+
7300 train 6.724388 (lr=6.6893e-05) (hash(x)=51546861)
|
| 533 |
+
6400 val loss 6.8335
|
| 534 |
+
6400 val perplexity 928.4501
|
| 535 |
+
6400 train 6.675262 (lr=9.6602e-05) (hash(x)=46054751)
|
| 536 |
+
7400 val loss 6.7554
|
| 537 |
+
7400 val perplexity 858.6415
|
| 538 |
+
7400 train 6.747338 (lr=6.6806e-05) (hash(x)=48320948)
|
| 539 |
+
45100 val loss 5.8066
|
| 540 |
+
45100 val perplexity 332.4754
|
| 541 |
+
45100 train 5.804672 (lr=6.0664e-06) (hash(x)=50093774)
|
| 542 |
+
6500 val loss 6.8217
|
| 543 |
+
6500 val perplexity 917.5349
|
| 544 |
+
6500 train 7.011214 (lr=9.6493e-05) (hash(x)=51816809)
|
| 545 |
+
7500 val loss 6.7581
|
| 546 |
+
7500 val perplexity 861.0138
|
| 547 |
+
7500 train 6.601406 (lr=6.6718e-05) (hash(x)=40167457)
|
| 548 |
+
45200 val loss 5.8079
|
| 549 |
+
45200 val perplexity 332.9253
|
| 550 |
+
45200 train 5.531459 (lr=6.0237e-06) (hash(x)=43460450)
|
| 551 |
+
7600 val loss 6.7487
|
| 552 |
+
7600 val perplexity 852.9794
|
| 553 |
+
7600 train 6.694093 (lr=6.6630e-05) (hash(x)=49942165)
|
| 554 |
+
6600 val loss 6.8168
|
| 555 |
+
6600 val perplexity 913.0578
|
| 556 |
+
6600 train 6.709125 (lr=9.6382e-05) (hash(x)=52453336)
|
| 557 |
+
7700 val loss 6.7365
|
| 558 |
+
7700 val perplexity 842.6336
|
| 559 |
+
7700 train 6.467313 (lr=6.6540e-05) (hash(x)=48853311)
|
| 560 |
+
45300 val loss 5.8075
|
| 561 |
+
45300 val perplexity 332.7967
|
| 562 |
+
45300 train 6.013202 (lr=5.9818e-06) (hash(x)=49935488)
|
| 563 |
+
6700 val loss 6.7775
|
| 564 |
+
6700 val perplexity 877.8773
|
| 565 |
+
6700 train 6.801557 (lr=9.6270e-05) (hash(x)=49108775)
|
| 566 |
+
7800 val loss 6.6977
|
| 567 |
+
7800 val perplexity 810.5665
|
| 568 |
+
7800 train 6.587886 (lr=6.6448e-05) (hash(x)=48510117)
|
| 569 |
+
7900 val loss 6.6830
|
| 570 |
+
7900 val perplexity 798.6885
|
| 571 |
+
7900 train 6.674704 (lr=6.6356e-05) (hash(x)=48339781)
|
| 572 |
+
45400 val loss 5.8084
|
| 573 |
+
45400 val perplexity 333.0822
|
| 574 |
+
45400 train 5.796069 (lr=5.9407e-06) (hash(x)=49447929)
|
| 575 |
+
6800 val loss 6.7620
|
| 576 |
+
6800 val perplexity 864.3433
|
| 577 |
+
6800 train 6.641977 (lr=9.6156e-05) (hash(x)=46745396)
|
| 578 |
+
8000 val loss 6.6692
|
| 579 |
+
8000 val perplexity 787.8011
|
| 580 |
+
8000 train 6.791812 (lr=6.6263e-05) (hash(x)=54927320)
|
| 581 |
+
45500 val loss 5.8073
|
| 582 |
+
45500 val perplexity 332.7321
|
| 583 |
+
45500 train 5.758606 (lr=5.9005e-06) (hash(x)=50713904)
|
| 584 |
+
8100 val loss 6.6685
|
| 585 |
+
8100 val perplexity 787.1943
|
| 586 |
+
8100 train 6.383808 (lr=6.6169e-05) (hash(x)=46461786)
|
| 587 |
+
6900 val loss 6.7623
|
| 588 |
+
6900 val perplexity 864.6273
|
| 589 |
+
6900 train 6.878799 (lr=9.6040e-05) (hash(x)=46534986)
|
| 590 |
+
8200 val loss 6.6542
|
| 591 |
+
8200 val perplexity 776.0621
|
| 592 |
+
8200 train 6.567985 (lr=6.6073e-05) (hash(x)=51536260)
|
| 593 |
+
45600 val loss 5.8281
|
| 594 |
+
45600 val perplexity 339.7057
|
| 595 |
+
45600 train 5.737741 (lr=5.8612e-06) (hash(x)=47674606)
|
| 596 |
+
7000 val loss 6.7450
|
| 597 |
+
7000 val perplexity 849.7970
|
| 598 |
+
7000 train 7.090025 (lr=9.5923e-05) (hash(x)=49317888)
|
| 599 |
+
8300 val loss 6.6445
|
| 600 |
+
8300 val perplexity 768.5745
|
| 601 |
+
8300 train 6.385070 (lr=6.5976e-05) (hash(x)=44770722)
|
| 602 |
+
45700 val loss 5.8069
|
| 603 |
+
45700 val perplexity 332.6016
|
| 604 |
+
45700 train 5.718114 (lr=5.8227e-06) (hash(x)=51539617)
|
| 605 |
+
8400 val loss 6.6291
|
| 606 |
+
8400 val perplexity 756.7707
|
| 607 |
+
8400 train 6.630779 (lr=6.5879e-05) (hash(x)=50104957)
|
| 608 |
+
7100 val loss 6.7150
|
| 609 |
+
7100 val perplexity 824.7158
|
| 610 |
+
7100 train 6.732314 (lr=9.5804e-05) (hash(x)=50360484)
|
| 611 |
+
8500 val loss 6.6186
|
| 612 |
+
8500 val perplexity 748.8944
|
| 613 |
+
8500 train 6.730096 (lr=6.5780e-05) (hash(x)=50132971)
|
| 614 |
+
45800 val loss 5.8064
|
| 615 |
+
45800 val perplexity 332.4121
|
| 616 |
+
45800 train 5.777891 (lr=5.7851e-06) (hash(x)=44448785)
|
| 617 |
+
7200 val loss 6.7163
|
| 618 |
+
7200 val perplexity 825.7302
|
| 619 |
+
7200 train 6.610737 (lr=9.5683e-05) (hash(x)=49515094)
|
| 620 |
+
8600 val loss 6.6052
|
| 621 |
+
8600 val perplexity 738.9441
|
| 622 |
+
8600 train 6.559481 (lr=6.5680e-05) (hash(x)=52193699)
|
| 623 |
+
45900 val loss 5.8067
|
| 624 |
+
45900 val perplexity 332.5286
|
| 625 |
+
45900 train 5.633348 (lr=5.7484e-06) (hash(x)=51499105)
|
| 626 |
+
8700 val loss 6.5946
|
| 627 |
+
8700 val perplexity 731.1027
|
| 628 |
+
8700 train 6.601634 (lr=6.5579e-05) (hash(x)=47902319)
|
| 629 |
+
7300 val loss 6.7274
|
| 630 |
+
7300 val perplexity 834.9750
|
| 631 |
+
7300 train 6.727911 (lr=9.5561e-05) (hash(x)=51546861)
|
| 632 |
+
8800 val loss 6.5905
|
| 633 |
+
8800 val perplexity 728.1722
|
| 634 |
+
8800 train 6.874655 (lr=6.5477e-05) (hash(x)=54904230)
|
| 635 |
+
46000 val loss 5.8019
|
| 636 |
+
46000 val perplexity 330.9422
|
| 637 |
+
46000 train 5.717524 (lr=5.7125e-06) (hash(x)=48359464)
|
| 638 |
+
8900 val loss 6.5658
|
| 639 |
+
8900 val perplexity 710.3594
|
| 640 |
+
7400 val loss 6.7125
|
| 641 |
+
7400 val perplexity 822.6318
|
| 642 |
+
8900 train 6.456809 (lr=6.5374e-05) (hash(x)=46311615)
|
| 643 |
+
7400 train 6.713262 (lr=9.5437e-05) (hash(x)=48320948)
|
| 644 |
+
46100 val loss 5.8034
|
| 645 |
+
46100 val perplexity 331.4209
|
| 646 |
+
46100 train 5.856997 (lr=5.6775e-06) (hash(x)=51885986)
|
| 647 |
+
9000 val loss 6.5762
|
| 648 |
+
9000 val perplexity 717.8407
|
| 649 |
+
9000 train 6.467047 (lr=6.5270e-05) (hash(x)=48535188)
|
| 650 |
+
7500 val loss 6.7149
|
| 651 |
+
7500 val perplexity 824.6104
|
| 652 |
+
7500 train 6.552715 (lr=9.5312e-05) (hash(x)=40167457)
|
| 653 |
+
9100 val loss 6.5482
|
| 654 |
+
9100 val perplexity 697.9618
|
| 655 |
+
9100 train 6.596004 (lr=6.5164e-05) (hash(x)=51757372)
|
| 656 |
+
46200 val loss 5.8014
|
| 657 |
+
46200 val perplexity 330.7549
|
| 658 |
+
46200 train 6.350838 (lr=5.6434e-06) (hash(x)=65186615)
|
| 659 |
+
9200 val loss 6.5645
|
| 660 |
+
9200 val perplexity 709.4841
|
| 661 |
+
9200 train 6.451322 (lr=6.5058e-05) (hash(x)=51131708)
|
| 662 |
+
7600 val loss 6.6876
|
| 663 |
+
7600 val perplexity 802.4150
|
| 664 |
+
7600 train 6.612953 (lr=9.5185e-05) (hash(x)=49942165)
|
| 665 |
+
9300 val loss 6.5492
|
| 666 |
+
9300 val perplexity 698.7104
|
| 667 |
+
9300 train 6.486413 (lr=6.4951e-05) (hash(x)=44784276)
|
| 668 |
+
46300 val loss 5.8012
|
| 669 |
+
46300 val perplexity 330.6895
|
| 670 |
+
46300 train 5.776049 (lr=5.6101e-06) (hash(x)=49626999)
|
| 671 |
+
7700 val loss 6.6817
|
| 672 |
+
7700 val perplexity 797.6483
|
| 673 |
+
7700 train 6.419989 (lr=9.5057e-05) (hash(x)=48853311)
|
| 674 |
+
9400 val loss 6.5449
|
| 675 |
+
9400 val perplexity 695.6748
|
| 676 |
+
9400 train 6.656165 (lr=6.4842e-05) (hash(x)=51981169)
|
| 677 |
+
46400 val loss 5.7988
|
| 678 |
+
46400 val perplexity 329.9163
|
| 679 |
+
46400 train 5.685073 (lr=5.5777e-06) (hash(x)=43325701)
|
| 680 |
+
9500 val loss 6.5243
|
| 681 |
+
9500 val perplexity 681.5016
|
| 682 |
+
9500 train 6.455208 (lr=6.4733e-05) (hash(x)=47232936)
|
| 683 |
+
7800 val loss 6.7101
|
| 684 |
+
7800 val perplexity 820.6340
|
| 685 |
+
7800 train 6.605823 (lr=9.4926e-05) (hash(x)=48510117)
|
| 686 |
+
9600 val loss 6.5283
|
| 687 |
+
9600 val perplexity 684.2013
|
| 688 |
+
9600 train 6.555980 (lr=6.4622e-05) (hash(x)=53800450)
|
| 689 |
+
46500 val loss 5.8014
|
| 690 |
+
46500 val perplexity 330.7682
|
| 691 |
+
46500 train 6.095033 (lr=5.5462e-06) (hash(x)=54028595)
|
| 692 |
+
7900 val loss 6.6982
|
| 693 |
+
7900 val perplexity 810.9125
|
| 694 |
+
7900 train 6.697021 (lr=9.4795e-05) (hash(x)=48339781)
|
| 695 |
+
9700 val loss 6.5110
|
| 696 |
+
9700 val perplexity 672.4990
|
| 697 |
+
9700 train 6.611804 (lr=6.4511e-05) (hash(x)=55768123)
|
| 698 |
+
46600 val loss 5.8004
|
| 699 |
+
46600 val perplexity 330.4477
|
| 700 |
+
46600 train 5.646887 (lr=5.5156e-06) (hash(x)=44519175)
|
| 701 |
+
9800 val loss 6.5201
|
| 702 |
+
9800 val perplexity 678.6589
|
| 703 |
+
9800 train 6.437235 (lr=6.4398e-05) (hash(x)=47745177)
|
| 704 |
+
8000 val loss 6.7015
|
| 705 |
+
8000 val perplexity 813.6622
|
| 706 |
+
8000 train 6.827817 (lr=9.4661e-05) (hash(x)=54927320)
|
| 707 |
+
46700 val loss 5.8020
|
| 708 |
+
46700 val perplexity 330.9630
|
| 709 |
+
46700 train 5.859873 (lr=5.4858e-06) (hash(x)=48357998)
|
| 710 |
+
9900 val loss 6.5059
|
| 711 |
+
9900 val perplexity 669.1097
|
| 712 |
+
9900 train 6.771001 (lr=6.4284e-05) (hash(x)=56592246)
|
| 713 |
+
8100 val loss 6.7125
|
| 714 |
+
8100 val perplexity 822.6165
|
| 715 |
+
8100 train 6.440390 (lr=9.4526e-05) (hash(x)=46461786)
|
| 716 |
+
10000 val loss 6.5050
|
| 717 |
+
10000 val perplexity 668.4611
|
| 718 |
+
10000 train 6.518945 (lr=6.4170e-05) (hash(x)=51655963)
|
| 719 |
+
46800 val loss 5.8007
|
| 720 |
+
46800 val perplexity 330.5271
|
| 721 |
+
46800 train 5.972873 (lr=5.4569e-06) (hash(x)=55911353)
|
| 722 |
+
10100 val loss 6.5228
|
| 723 |
+
10100 val perplexity 680.4901
|
| 724 |
+
10100 train 6.419588 (lr=6.4054e-05) (hash(x)=49809511)
|
| 725 |
+
8200 val loss 6.6683
|
| 726 |
+
8200 val perplexity 787.0919
|
| 727 |
+
8200 train 6.587510 (lr=9.4390e-05) (hash(x)=51536260)
|
| 728 |
+
46900 val loss 5.8022
|
| 729 |
+
46900 val perplexity 331.0330
|
| 730 |
+
46900 train 5.741434 (lr=5.4289e-06) (hash(x)=47897187)
|
| 731 |
+
10200 val loss 6.5020
|
| 732 |
+
10200 val perplexity 666.4471
|
| 733 |
+
10200 train 6.091907 (lr=6.3937e-05) (hash(x)=42297812)
|
| 734 |
+
8300 val loss 6.7038
|
| 735 |
+
8300 val perplexity 815.4675
|
| 736 |
+
8300 train 6.438375 (lr=9.4252e-05) (hash(x)=44770722)
|
| 737 |
+
10300 val loss 6.4973
|
| 738 |
+
10300 val perplexity 663.3268
|
| 739 |
+
10300 train 6.167974 (lr=6.3820e-05) (hash(x)=55529820)
|
| 740 |
+
47000 val loss 5.7987
|
| 741 |
+
47000 val perplexity 329.8616
|
| 742 |
+
47000 train 5.359689 (lr=5.4017e-06) (hash(x)=43196571)
|
| 743 |
+
10400 val loss 6.5084
|
| 744 |
+
10400 val perplexity 670.7539
|
| 745 |
+
10400 train 6.523131 (lr=6.3701e-05) (hash(x)=53255684)
|
| 746 |
+
8400 val loss 6.6674
|
| 747 |
+
8400 val perplexity 786.3558
|
| 748 |
+
8400 train 6.660437 (lr=9.4112e-05) (hash(x)=50104957)
|
| 749 |
+
10500 val loss 6.4665
|
| 750 |
+
10500 val perplexity 643.2431
|
| 751 |
+
10500 train 6.590051 (lr=6.3581e-05) (hash(x)=54306191)
|
| 752 |
+
47100 val loss 5.7992
|
| 753 |
+
47100 val perplexity 330.0356
|
| 754 |
+
47100 train 5.657062 (lr=5.3755e-06) (hash(x)=51224987)
|
| 755 |
+
8500 val loss 6.6906
|
| 756 |
+
8500 val perplexity 804.8203
|
| 757 |
+
8500 train 6.818645 (lr=9.3971e-05) (hash(x)=50132971)
|
| 758 |
+
10600 val loss 6.4734
|
| 759 |
+
10600 val perplexity 647.6887
|
| 760 |
+
10600 train 6.714907 (lr=6.3460e-05) (hash(x)=60130567)
|
| 761 |
+
47200 val loss 5.8016
|
| 762 |
+
47200 val perplexity 330.8251
|
| 763 |
+
47200 train 5.508888 (lr=5.3501e-06) (hash(x)=47943697)
|
| 764 |
+
10700 val loss 6.4934
|
| 765 |
+
10700 val perplexity 660.7467
|
| 766 |
+
10700 train 6.420184 (lr=6.3339e-05) (hash(x)=50074737)
|
| 767 |
+
8600 val loss 6.6548
|
| 768 |
+
8600 val perplexity 776.4918
|
| 769 |
+
8600 train 6.595765 (lr=9.3828e-05) (hash(x)=52193699)
|
| 770 |
+
10800 val loss 6.4459
|
| 771 |
+
10800 val perplexity 630.1255
|
| 772 |
+
10800 train 6.556093 (lr=6.3216e-05) (hash(x)=51547220)
|
| 773 |
+
47300 val loss 5.8012
|
| 774 |
+
47300 val perplexity 330.6942
|
| 775 |
+
47300 train 5.713035 (lr=5.3256e-06) (hash(x)=47351003)
|
| 776 |
+
10900 val loss 6.4832
|
| 777 |
+
10900 val perplexity 654.0450
|
| 778 |
+
10900 train 6.565746 (lr=6.3092e-05) (hash(x)=55943981)
|
| 779 |
+
8700 val loss 6.6440
|
| 780 |
+
8700 val perplexity 768.1542
|
| 781 |
+
8700 train 6.659471 (lr=9.3684e-05) (hash(x)=47902319)
|
| 782 |
+
47400 val loss 5.8019
|
| 783 |
+
47400 val perplexity 330.9364
|
| 784 |
+
47400 train 5.875019 (lr=5.3020e-06) (hash(x)=55562243)
|
| 785 |
+
11000 val loss 6.4434
|
| 786 |
+
11000 val perplexity 628.5189
|
| 787 |
+
11000 train 6.483435 (lr=6.2968e-05) (hash(x)=46444570)
|
| 788 |
+
8800 val loss 6.6390
|
| 789 |
+
8800 val perplexity 764.3397
|
| 790 |
+
8800 train 6.929801 (lr=9.3538e-05) (hash(x)=54904230)
|
| 791 |
+
11100 val loss 6.4500
|
| 792 |
+
11100 val perplexity 632.7070
|
| 793 |
+
11100 train 6.459686 (lr=6.2842e-05) (hash(x)=49589063)
|
| 794 |
+
47500 val loss 5.8001
|
| 795 |
+
47500 val perplexity 330.3456
|
| 796 |
+
47500 train 5.675170 (lr=5.2792e-06) (hash(x)=53544850)
|
| 797 |
+
11200 val loss 6.4506
|
| 798 |
+
11200 val perplexity 633.0538
|
| 799 |
+
11200 train 6.457446 (lr=6.2715e-05) (hash(x)=51392283)
|
| 800 |
+
8900 val loss 6.6381
|
| 801 |
+
8900 val perplexity 763.6228
|
| 802 |
+
8900 train 6.531604 (lr=9.3391e-05) (hash(x)=46311615)
|
| 803 |
+
47600 val loss 5.8024
|
| 804 |
+
47600 val perplexity 331.0871
|
| 805 |
+
47600 train 5.985342 (lr=5.2574e-06) (hash(x)=43634907)
|
| 806 |
+
11300 val loss 6.4364
|
| 807 |
+
11300 val perplexity 624.1540
|
| 808 |
+
11300 train 6.397132 (lr=6.2588e-05) (hash(x)=45081133)
|
| 809 |
+
9000 val loss 6.6386
|
| 810 |
+
9000 val perplexity 764.0009
|
| 811 |
+
9000 train 6.519309 (lr=9.3242e-05) (hash(x)=48535188)
|
| 812 |
+
11400 val loss 6.4289
|
| 813 |
+
11400 val perplexity 619.4864
|
| 814 |
+
11400 train 6.524812 (lr=6.2459e-05) (hash(x)=53700397)
|
| 815 |
+
47700 val loss 5.7999
|
| 816 |
+
47700 val perplexity 330.2570
|
| 817 |
+
47700 train 5.776142 (lr=5.2364e-06) (hash(x)=47909383)
|
| 818 |
+
11500 val loss 6.4207
|
| 819 |
+
11500 val perplexity 614.4413
|
| 820 |
+
11500 train 6.142918 (lr=6.2330e-05) (hash(x)=43839088)
|
| 821 |
+
9100 val loss 6.6102
|
| 822 |
+
9100 val perplexity 742.6683
|
| 823 |
+
9100 train 6.649930 (lr=9.3092e-05) (hash(x)=51757372)
|
| 824 |
+
47800 val loss 5.7953
|
| 825 |
+
47800 val perplexity 328.7346
|
| 826 |
+
47800 train 5.674577 (lr=5.2163e-06) (hash(x)=45871079)
|
| 827 |
+
11600 val loss 6.4324
|
| 828 |
+
11600 val perplexity 621.6376
|
| 829 |
+
11600 train 6.465709 (lr=6.2199e-05) (hash(x)=48088111)
|
| 830 |
+
9200 val loss 6.6127
|
| 831 |
+
9200 val perplexity 744.4720
|
| 832 |
+
9200 train 6.500169 (lr=9.2940e-05) (hash(x)=51131708)
|
| 833 |
+
11700 val loss 6.4299
|
| 834 |
+
11700 val perplexity 620.0857
|
| 835 |
+
11700 train 6.745090 (lr=6.2068e-05) (hash(x)=55108226)
|
| 836 |
+
47900 val loss 5.7951
|
| 837 |
+
47900 val perplexity 328.6861
|
| 838 |
+
47900 train 5.784526 (lr=5.1972e-06) (hash(x)=47333324)
|
| 839 |
+
11800 val loss 6.4176
|
| 840 |
+
11800 val perplexity 612.5077
|
| 841 |
+
11800 train 6.684676 (lr=6.1936e-05) (hash(x)=58524839)
|
| 842 |
+
9300 val loss 6.5997
|
| 843 |
+
9300 val perplexity 734.8544
|
| 844 |
+
9300 train 6.554953 (lr=9.2786e-05) (hash(x)=44784276)
|
| 845 |
+
48000 val loss 5.7948
|
| 846 |
+
48000 val perplexity 328.5829
|
| 847 |
+
48000 train 5.876556 (lr=5.1788e-06) (hash(x)=52758020)
|
| 848 |
+
11900 val loss 6.4150
|
| 849 |
+
11900 val perplexity 610.9667
|
| 850 |
+
11900 train 6.081831 (lr=6.1802e-05) (hash(x)=43864078)
|
| 851 |
+
9400 val loss 6.5917
|
| 852 |
+
9400 val perplexity 729.0008
|
| 853 |
+
9400 train 6.715466 (lr=9.2632e-05) (hash(x)=51981169)
|
| 854 |
+
12000 val loss 6.4093
|
| 855 |
+
12000 val perplexity 607.4685
|
| 856 |
+
12000 train 6.095185 (lr=6.1668e-05) (hash(x)=43448544)
|
| 857 |
+
48100 val loss 5.7937
|
| 858 |
+
48100 val perplexity 328.2226
|
| 859 |
+
48100 train 5.802242 (lr=5.1614e-06) (hash(x)=49806349)
|
| 860 |
+
12100 val loss 6.3978
|
| 861 |
+
12100 val perplexity 600.5402
|
| 862 |
+
12100 train 6.396197 (lr=6.1533e-05) (hash(x)=55200399)
|
| 863 |
+
9500 val loss 6.6014
|
| 864 |
+
9500 val perplexity 736.1489
|
| 865 |
+
9500 train 6.523407 (lr=9.2475e-05) (hash(x)=47232936)
|
| 866 |
+
12200 val loss 6.4067
|
| 867 |
+
12200 val perplexity 605.8679
|
| 868 |
+
48200 val loss 5.7914
|
| 869 |
+
48200 val perplexity 327.4593
|
| 870 |
+
12200 train 6.546592 (lr=6.1397e-05) (hash(x)=57627314)
|
| 871 |
+
48200 train 5.868467 (lr=5.1449e-06) (hash(x)=53220839)
|
| 872 |
+
9600 val loss 6.5844
|
| 873 |
+
9600 val perplexity 723.6813
|
| 874 |
+
9600 train 6.593096 (lr=9.2317e-05) (hash(x)=53800450)
|
| 875 |
+
12300 val loss 6.4145
|
| 876 |
+
12300 val perplexity 610.6370
|
| 877 |
+
12300 train 6.693074 (lr=6.1260e-05) (hash(x)=53617087)
|
| 878 |
+
48300 val loss 5.7940
|
| 879 |
+
48300 val perplexity 328.3305
|
| 880 |
+
48300 train 5.822805 (lr=5.1293e-06) (hash(x)=56052541)
|
| 881 |
+
12400 val loss 6.4177
|
| 882 |
+
12400 val perplexity 612.5649
|
| 883 |
+
12400 train 6.262563 (lr=6.1122e-05) (hash(x)=51135678)
|
| 884 |
+
9700 val loss 6.5799
|
| 885 |
+
9700 val perplexity 720.4314
|
| 886 |
+
9700 train 6.675194 (lr=9.2158e-05) (hash(x)=55768123)
|
| 887 |
+
48400 val loss 5.7905
|
| 888 |
+
48400 val perplexity 327.1818
|
| 889 |
+
48400 train 5.670732 (lr=5.1145e-06) (hash(x)=44482356)
|
| 890 |
+
12500 val loss 6.4237
|
| 891 |
+
12500 val perplexity 616.2940
|
| 892 |
+
12500 train 6.428000 (lr=6.0984e-05) (hash(x)=48025130)
|
| 893 |
+
12600 val loss 6.4127
|
| 894 |
+
12600 val perplexity 609.5201
|
| 895 |
+
9800 val loss 6.5909
|
| 896 |
+
9800 val perplexity 728.4636
|
| 897 |
+
12600 train 6.470107 (lr=6.0844e-05) (hash(x)=52135695)
|
| 898 |
+
9800 train 6.517842 (lr=9.1997e-05) (hash(x)=47745177)
|
| 899 |
+
48500 val loss 5.7897
|
| 900 |
+
48500 val perplexity 326.9112
|
| 901 |
+
48500 train 5.520173 (lr=5.1007e-06) (hash(x)=45714818)
|
| 902 |
+
12700 val loss 6.4382
|
| 903 |
+
12700 val perplexity 625.2830
|
| 904 |
+
12700 train 6.303355 (lr=6.0703e-05) (hash(x)=51888613)
|
| 905 |
+
9900 val loss 6.5974
|
| 906 |
+
9900 val perplexity 733.2065
|
| 907 |
+
9900 train 6.867185 (lr=9.1835e-05) (hash(x)=56592246)
|
| 908 |
+
48600 val loss 5.7913
|
| 909 |
+
48600 val perplexity 327.4302
|
| 910 |
+
48600 train 5.555061 (lr=5.0877e-06) (hash(x)=49476556)
|
| 911 |
+
12800 val loss 6.4182
|
| 912 |
+
12800 val perplexity 612.8937
|
| 913 |
+
12800 train 6.351943 (lr=6.0562e-05) (hash(x)=50418818)
|
| 914 |
+
12900 val loss 6.4053
|
| 915 |
+
12900 val perplexity 605.0304
|
| 916 |
+
12900 train 7.257672 (lr=6.0420e-05) (hash(x)=58649585)
|
| 917 |
+
10000 val loss 6.6087
|
| 918 |
+
10000 val perplexity 741.5094
|
| 919 |
+
10000 train 6.601171 (lr=9.1671e-05) (hash(x)=51655963)
|
| 920 |
+
48700 val loss 5.7916
|
| 921 |
+
48700 val perplexity 327.5210
|
| 922 |
+
48700 train 5.483535 (lr=5.0756e-06) (hash(x)=42508579)
|
| 923 |
+
13000 val loss 6.3998
|
| 924 |
+
13000 val perplexity 601.7260
|
| 925 |
+
13000 train 6.663928 (lr=6.0277e-05) (hash(x)=54567307)
|
| 926 |
+
10100 val loss 6.5917
|
| 927 |
+
10100 val perplexity 728.9938
|
| 928 |
+
10100 train 6.492918 (lr=9.1506e-05) (hash(x)=49809511)
|
| 929 |
+
13100 val loss 6.3887
|
| 930 |
+
13100 val perplexity 595.0941
|
| 931 |
+
13100 train 6.565659 (lr=6.0133e-05) (hash(x)=52071473)
|
| 932 |
+
48800 val loss 5.7920
|
| 933 |
+
48800 val perplexity 327.6678
|
| 934 |
+
48800 train 6.004168 (lr=5.0644e-06) (hash(x)=52737449)
|
| 935 |
+
13200 val loss 6.3823
|
| 936 |
+
13200 val perplexity 591.2615
|
| 937 |
+
13200 train 6.148626 (lr=5.9988e-05) (hash(x)=46293092)
|
| 938 |
+
10200 val loss 6.5790
|
| 939 |
+
10200 val perplexity 719.8549
|
| 940 |
+
10200 train 6.172993 (lr=9.1339e-05) (hash(x)=42297812)
|
| 941 |
+
48900 val loss 5.7912
|
| 942 |
+
48900 val perplexity 327.4165
|
| 943 |
+
48900 train 5.636753 (lr=5.0542e-06) (hash(x)=47057569)
|
| 944 |
+
13300 val loss 6.3882
|
| 945 |
+
13300 val perplexity 594.8053
|
| 946 |
+
13300 train 6.513732 (lr=5.9842e-05) (hash(x)=56511467)
|
| 947 |
+
10300 val loss 6.5718
|
| 948 |
+
10300 val perplexity 714.6194
|
| 949 |
+
10300 train 6.288669 (lr=9.1171e-05) (hash(x)=55529820)
|
| 950 |
+
13400 val loss 6.3714
|
| 951 |
+
13400 val perplexity 584.8615
|
| 952 |
+
13400 train 6.486885 (lr=5.9695e-05) (hash(x)=54753763)
|
| 953 |
+
49000 val loss 5.7940
|
| 954 |
+
49000 val perplexity 328.3136
|
| 955 |
+
49000 train 5.770644 (lr=5.0448e-06) (hash(x)=49908975)
|
| 956 |
+
13500 val loss 6.3630
|
| 957 |
+
13500 val perplexity 579.9861
|
| 958 |
+
13500 train 6.510321 (lr=5.9548e-05) (hash(x)=53610247)
|
| 959 |
+
10400 val loss 6.5765
|
| 960 |
+
10400 val perplexity 717.9875
|
| 961 |
+
10400 train 6.559584 (lr=9.1001e-05) (hash(x)=53255684)
|
| 962 |
+
49100 val loss 5.7920
|
| 963 |
+
49100 val perplexity 327.6564
|
| 964 |
+
49100 train 5.614700 (lr=5.0363e-06) (hash(x)=48427414)
|
| 965 |
+
13600 val loss 6.3478
|
| 966 |
+
13600 val perplexity 571.2138
|
| 967 |
+
13600 train 6.225275 (lr=5.9400e-05) (hash(x)=47526249)
|
| 968 |
+
10500 val loss 6.5377
|
| 969 |
+
10500 val perplexity 690.7025
|
| 970 |
+
10500 train 6.650892 (lr=9.0830e-05) (hash(x)=54306191)
|
| 971 |
+
13700 val loss 6.3612
|
| 972 |
+
13700 val perplexity 578.9458
|
| 973 |
+
13700 train 6.217256 (lr=5.9251e-05) (hash(x)=51185517)
|
| 974 |
+
49200 val loss 5.7897
|
| 975 |
+
49200 val perplexity 326.9223
|
| 976 |
+
49200 train 5.618961 (lr=5.0286e-06) (hash(x)=50246074)
|
| 977 |
+
13800 val loss 6.3604
|
| 978 |
+
13800 val perplexity 578.4911
|
| 979 |
+
13800 train 6.032686 (lr=5.9101e-05) (hash(x)=45953529)
|
| 980 |
+
10600 val loss 6.5441
|
| 981 |
+
10600 val perplexity 695.1240
|
| 982 |
+
10600 train 6.779467 (lr=9.0658e-05) (hash(x)=60130567)
|
| 983 |
+
49300 val loss 5.7902
|
| 984 |
+
49300 val perplexity 327.0729
|
| 985 |
+
49300 train 5.844873 (lr=5.0219e-06) (hash(x)=47715359)
|
| 986 |
+
13900 val loss 6.3799
|
| 987 |
+
13900 val perplexity 589.8560
|
| 988 |
+
13900 train 6.307417 (lr=5.8950e-05) (hash(x)=47238157)
|
| 989 |
+
14000 val loss 6.3578
|
| 990 |
+
14000 val perplexity 577.0012
|
| 991 |
+
14000 train 6.338602 (lr=5.8799e-05) (hash(x)=54250750)
|
| 992 |
+
10700 val loss 6.5191
|
| 993 |
+
10700 val perplexity 677.9451
|
| 994 |
+
10700 train 6.443255 (lr=9.0484e-05) (hash(x)=50074737)
|
| 995 |
+
49400 val loss 5.7951
|
| 996 |
+
49400 val perplexity 328.7013
|
| 997 |
+
49400 train 5.789733 (lr=5.0161e-06) (hash(x)=50175867)
|
| 998 |
+
14100 val loss 6.3644
|
| 999 |
+
14100 val perplexity 580.7674
|
| 1000 |
+
14100 train 6.351201 (lr=5.8646e-05) (hash(x)=48198552)
|
| 1001 |
+
10800 val loss 6.5320
|
| 1002 |
+
10800 val perplexity 686.7475
|
| 1003 |
+
10800 train 6.645676 (lr=9.0308e-05) (hash(x)=51547220)
|
| 1004 |
+
14200 val loss 6.3626
|
| 1005 |
+
14200 val perplexity 579.7433
|
| 1006 |
+
49500 val loss 5.7927
|
| 1007 |
+
49500 val perplexity 327.8968
|
| 1008 |
+
14200 train 6.497223 (lr=5.8493e-05) (hash(x)=52020690)
|
| 1009 |
+
49500 train 5.631345 (lr=5.0112e-06) (hash(x)=49336040)
|
| 1010 |
+
14300 val loss 6.3541
|
| 1011 |
+
14300 val perplexity 574.8251
|
| 1012 |
+
14300 train 6.156728 (lr=5.8339e-05) (hash(x)=45165483)
|
| 1013 |
+
10900 val loss 6.5470
|
| 1014 |
+
10900 val perplexity 697.1835
|
| 1015 |
+
10900 train 6.629424 (lr=9.0132e-05) (hash(x)=55943981)
|
| 1016 |
+
49600 val loss 5.7945
|
| 1017 |
+
49600 val perplexity 328.5037
|
| 1018 |
+
49600 train 5.834286 (lr=5.0072e-06) (hash(x)=52039357)
|
| 1019 |
+
14400 val loss 6.3595
|
| 1020 |
+
14400 val perplexity 577.9749
|
| 1021 |
+
14400 train 6.475995 (lr=5.8184e-05) (hash(x)=52184072)
|
| 1022 |
+
11000 val loss 6.5333
|
| 1023 |
+
11000 val perplexity 687.6558
|
| 1024 |
+
11000 train 6.565971 (lr=8.9954e-05) (hash(x)=46444570)
|
| 1025 |
+
14500 val loss 6.3595
|
| 1026 |
+
14500 val perplexity 577.9520
|
| 1027 |
+
14500 train 6.706707 (lr=5.8029e-05) (hash(x)=56046436)
|
| 1028 |
+
49700 val loss 5.7948
|
| 1029 |
+
49700 val perplexity 328.5858
|
| 1030 |
+
49700 train 5.590436 (lr=5.0040e-06) (hash(x)=47568707)
|
| 1031 |
+
14600 val loss 6.3534
|
| 1032 |
+
14600 val perplexity 574.4607
|
| 1033 |
+
14600 train 6.230230 (lr=5.7872e-05) (hash(x)=52029694)
|
| 1034 |
+
11100 val loss 6.5385
|
| 1035 |
+
11100 val perplexity 691.2264
|
| 1036 |
+
11100 train 6.547019 (lr=8.9774e-05) (hash(x)=49589063)
|
| 1037 |
+
49800 val loss 5.7962
|
| 1038 |
+
49800 val perplexity 329.0321
|
| 1039 |
+
49800 train 5.719766 (lr=5.0018e-06) (hash(x)=48451274)
|
| 1040 |
+
14700 val loss 6.3699
|
| 1041 |
+
14700 val perplexity 584.0244
|
| 1042 |
+
14700 train 6.211578 (lr=5.7715e-05) (hash(x)=50258224)
|
| 1043 |
+
11200 val loss 6.5242
|
| 1044 |
+
11200 val perplexity 681.4029
|
| 1045 |
+
11200 train 6.523566 (lr=8.9593e-05) (hash(x)=51392283)
|
| 1046 |
+
14800 val loss 6.3676
|
| 1047 |
+
14800 val perplexity 582.6866
|
| 1048 |
+
14800 train 5.692614 (lr=5.7558e-05) (hash(x)=42112262)
|
| 1049 |
+
49900 val loss 5.7984
|
| 1050 |
+
49900 val perplexity 329.7762
|
| 1051 |
+
49900 train 5.486496 (lr=5.0004e-06) (hash(x)=44523603)
|
| 1052 |
+
14900 val loss 6.3608
|
| 1053 |
+
14900 val perplexity 578.7267
|
| 1054 |
+
14900 train 6.101720 (lr=5.7399e-05) (hash(x)=47219933)
|
| 1055 |
+
11300 val loss 6.5394
|
| 1056 |
+
11300 val perplexity 691.8585
|
| 1057 |
+
11300 train 6.495648 (lr=8.9411e-05) (hash(x)=45081133)
|
| 1058 |
+
49999 val loss 5.7909
|
| 1059 |
+
49999 val perplexity 327.3206
|
| 1060 |
+
15000 val loss 6.4009
|
| 1061 |
+
15000 val perplexity 602.3866
|
| 1062 |
+
15000 train 6.002666 (lr=5.7240e-05) (hash(x)=58309309)
|
| 1063 |
+
11400 val loss 6.5171
|
| 1064 |
+
11400 val perplexity 676.6274
|
| 1065 |
+
11400 train 6.611968 (lr=8.9227e-05) (hash(x)=53700397)
|
| 1066 |
+
15100 val loss 6.3498
|
| 1067 |
+
15100 val perplexity 572.3857
|
| 1068 |
+
15100 train 6.254695 (lr=5.7079e-05) (hash(x)=48756049)
|
| 1069 |
+
15200 val loss 6.3577
|
| 1070 |
+
15200 val perplexity 576.9346
|
| 1071 |
+
15200 train 6.047930 (lr=5.6919e-05) (hash(x)=49791737)
|
| 1072 |
+
11500 val loss 6.5199
|
| 1073 |
+
11500 val perplexity 678.5172
|
| 1074 |
+
11500 train 6.260948 (lr=8.9043e-05) (hash(x)=43839088)
|
| 1075 |
+
15300 val loss 6.3336
|
| 1076 |
+
15300 val perplexity 563.2054
|
| 1077 |
+
15300 train 6.302131 (lr=5.6757e-05) (hash(x)=53084126)
|
| 1078 |
+
11600 val loss 6.5099
|
| 1079 |
+
11600 val perplexity 671.7664
|
| 1080 |
+
11600 train 6.543172 (lr=8.8856e-05) (hash(x)=48088111)
|
| 1081 |
+
15400 val loss 6.3310
|
| 1082 |
+
15400 val perplexity 561.6938
|
| 1083 |
+
15400 train 6.422942 (lr=5.6595e-05) (hash(x)=55041679)
|
| 1084 |
+
15500 val loss 6.3303
|
| 1085 |
+
15500 val perplexity 561.3511
|
| 1086 |
+
15500 train 6.124517 (lr=5.6432e-05) (hash(x)=43249867)
|
| 1087 |
+
11700 val loss 6.5147
|
| 1088 |
+
11700 val perplexity 674.9705
|
| 1089 |
+
11700 train 6.829165 (lr=8.8668e-05) (hash(x)=55108226)
|
| 1090 |
+
15600 val loss 6.3211
|
| 1091 |
+
15600 val perplexity 556.1926
|
| 1092 |
+
15600 train 6.312618 (lr=5.6268e-05) (hash(x)=49006517)
|
| 1093 |
+
15700 val loss 6.3168
|
| 1094 |
+
15700 val perplexity 553.8163
|
| 1095 |
+
15700 train 6.645555 (lr=5.6104e-05) (hash(x)=60986839)
|
| 1096 |
+
11800 val loss 6.5320
|
| 1097 |
+
11800 val perplexity 686.7426
|
| 1098 |
+
11800 train 6.764431 (lr=8.8479e-05) (hash(x)=58524839)
|
| 1099 |
+
15800 val loss 6.3371
|
| 1100 |
+
15800 val perplexity 565.1663
|
| 1101 |
+
15800 train 6.246183 (lr=5.5938e-05) (hash(x)=48354906)
|
| 1102 |
+
11900 val loss 6.5044
|
| 1103 |
+
11900 val perplexity 668.0739
|
| 1104 |
+
11900 train 6.170547 (lr=8.8289e-05) (hash(x)=43864078)
|
| 1105 |
+
15900 val loss 6.3422
|
| 1106 |
+
15900 val perplexity 568.0605
|
| 1107 |
+
15900 train 6.207072 (lr=5.5773e-05) (hash(x)=52679780)
|
| 1108 |
+
16000 val loss 6.3104
|
| 1109 |
+
16000 val perplexity 550.2414
|
| 1110 |
+
16000 train 6.475496 (lr=5.5606e-05) (hash(x)=58049587)
|
| 1111 |
+
12000 val loss 6.5055
|
| 1112 |
+
12000 val perplexity 668.7924
|
| 1113 |
+
12000 train 6.222349 (lr=8.8097e-05) (hash(x)=43448544)
|
attention_kindselective_n_heads4_seed1340/model_02500.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 92843394
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e856700b2ba8c1c78fec371f511980a9165c3a304e65b1e2ecf19f9a9ba2c4f5
|
| 3 |
size 92843394
|
attention_kindselective_n_heads4_seed1340/model_05000.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 92843394
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ce0f91188a61cf2a20045bdc8d3dc3070278a30ac99975978bf3d232dd03f70
|
| 3 |
size 92843394
|
attention_kindselective_n_heads4_seed1340/model_07500.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 92843394
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21edaf2540d7212d65d0bd210419f7d6a0f9281cde1d06f864c3ec1d63fff124
|
| 3 |
size 92843394
|
attention_kindselective_n_heads4_seed1340/model_10000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e28d809d0626f302c7278753b8c13072c86cc80bea7caeb91264050622dd5e7
|
| 3 |
+
size 92843394
|
attention_kindselective_n_heads4_seed1340/model_12500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69a6cd3908266914a289ba86b8a7d2af8ce51bf4034f6b670c6730502d6f65d4
|
| 3 |
+
size 92843394
|
attention_kindselective_n_heads4_seed1340/model_42500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e937e5bde413b5f1d5cd2e7a03428c77cca6ac8ef7547cc1d8aa0565c33ea0d
|
| 3 |
+
size 92843394
|
attention_kindselective_n_heads4_seed1340/model_45000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:03ef3fa0f39e905fced4c8e9f6d4c4c36d3dfdec8a8589b65a5f4c6363469015
|
| 3 |
+
size 92843394
|
attention_kindselective_n_heads4_seed1340/model_47500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:58579aca525c6640a50ed6de3f5bc997ccf568b1c74ac74080a8413181c72725
|
| 3 |
+
size 92843394
|
attention_kindselective_n_heads4_seed1340/model_49999.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ddd2671bc53d3d47c2a3597b04a352fe6e56e1aa82a19bed3ee6f4e205890842
|
| 3 |
+
size 92843394
|
attention_kindselective_n_heads4_seed1340/optimizer_02500.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 179406214
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71d5d80bc8621c01c7517a9c8966efcf316b98161adb2e069c5201828111ea9b
|
| 3 |
size 179406214
|
attention_kindselective_n_heads4_seed1340/optimizer_05000.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 179406214
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4cf0986c29b0869c7e9016f8a369414e67e321e3e73344a91dc6a3451b88ff99
|
| 3 |
size 179406214
|
attention_kindselective_n_heads4_seed1340/optimizer_07500.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 179406214
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f45a3b4647bd40c9e04b365a1e5c1703f4207fa3698dc385fc2cfd5bc0ea4bd5
|
| 3 |
size 179406214
|
attention_kindselective_n_heads4_seed1340/optimizer_10000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06bc9c8845c72cf18f14e45b43f7c4c19b17aa50a69cff163f981ea53f88744f
|
| 3 |
+
size 179406214
|
attention_kindselective_n_heads4_seed1340/optimizer_12500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:61803c8687b0d4b6de778e759324ef063faecd26bcfb6e193e45505fe759b8d9
|
| 3 |
+
size 179406214
|
attention_kindselective_n_heads4_seed1340/optimizer_42500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dffbcca299af84d6d052050b7c9c7230f9052fbb489bbc7104607f111ccd656e
|
| 3 |
+
size 179406214
|
attention_kindselective_n_heads4_seed1340/optimizer_45000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b4dba3c28b0a3140da1cda29187583bfb0ffbc5b49bd20693dae0ef1a68772c
|
| 3 |
+
size 179406214
|
attention_kindselective_n_heads4_seed1340/optimizer_47500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee4537d10c929400a183bc037015dc153913766e3bc65eeecde61d2e9ef87311
|
| 3 |
+
size 179406214
|
attention_kindselective_n_heads4_seed1340/optimizer_49999.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5280d94beeaac57235e9ca63bb2521ca98af393b93180fe9a674caafe9bb168
|
| 3 |
+
size 179406214
|