Upload folder using huggingface_hub
Browse files
fw57M_Surprisal_thresholdB_16000/tokenizer.json
CHANGED
|
@@ -40,7 +40,10 @@
|
|
| 40 |
]
|
| 41 |
},
|
| 42 |
"pre_tokenizer": {
|
| 43 |
-
"type": "
|
|
|
|
|
|
|
|
|
|
| 44 |
},
|
| 45 |
"post_processor": {
|
| 46 |
"type": "ByteLevel",
|
|
@@ -318,262 +321,262 @@
|
|
| 318 |
"Ł": 255,
|
| 319 |
"ł": 256,
|
| 320 |
"Ń": 257,
|
| 321 |
-
"
|
| 322 |
"##z": 259,
|
| 323 |
-
"
|
| 324 |
-
"
|
| 325 |
-
"
|
| 326 |
-
"
|
| 327 |
-
"
|
| 328 |
-
"##
|
| 329 |
-
"
|
| 330 |
-
"##
|
| 331 |
-
"
|
| 332 |
-
"##
|
| 333 |
-
"
|
| 334 |
-
"
|
| 335 |
-
"
|
| 336 |
-
"
|
| 337 |
-
"
|
| 338 |
-
"
|
| 339 |
-
"
|
| 340 |
-
"
|
| 341 |
-
"
|
| 342 |
-
"##
|
| 343 |
-
"
|
| 344 |
-
"
|
| 345 |
-
"
|
| 346 |
-
"
|
| 347 |
-
"
|
| 348 |
-
"
|
| 349 |
-
"
|
| 350 |
-
"
|
| 351 |
-
"
|
| 352 |
-
"
|
| 353 |
-
"
|
| 354 |
-
"
|
| 355 |
-
"
|
| 356 |
-
"
|
| 357 |
-
"
|
| 358 |
-
"
|
| 359 |
-
"
|
| 360 |
-
"
|
| 361 |
-
"
|
| 362 |
-
"
|
| 363 |
-
"##
|
| 364 |
-
"
|
| 365 |
-
"
|
| 366 |
-
"
|
| 367 |
-
"
|
| 368 |
-
"##
|
| 369 |
-
"
|
| 370 |
-
"
|
| 371 |
-
"
|
| 372 |
-
"
|
| 373 |
-
"
|
| 374 |
-
"
|
| 375 |
-
"
|
| 376 |
-
"
|
| 377 |
-
"
|
| 378 |
-
"
|
| 379 |
-
"
|
| 380 |
-
"
|
| 381 |
-
"
|
| 382 |
-
"##
|
| 383 |
-
"
|
| 384 |
-
"
|
| 385 |
-
"
|
| 386 |
-
"
|
| 387 |
-
"
|
| 388 |
-
"
|
| 389 |
-
"##
|
| 390 |
-
"
|
| 391 |
-
"##
|
| 392 |
-
"
|
| 393 |
-
"
|
| 394 |
-
"
|
| 395 |
-
"
|
| 396 |
-
"
|
| 397 |
-
"
|
| 398 |
-
"
|
| 399 |
-
"
|
| 400 |
-
"
|
| 401 |
-
"
|
| 402 |
-
"
|
| 403 |
-
"
|
| 404 |
-
"
|
| 405 |
-
"
|
| 406 |
-
"
|
| 407 |
-
"
|
| 408 |
-
"
|
| 409 |
-
"
|
| 410 |
-
"
|
| 411 |
-
"
|
| 412 |
-
"
|
| 413 |
-
"##
|
| 414 |
-
"
|
| 415 |
-
"
|
| 416 |
-
"
|
| 417 |
-
"
|
| 418 |
-
"
|
| 419 |
-
"
|
| 420 |
-
"
|
| 421 |
-
"
|
| 422 |
-
"
|
| 423 |
-
"
|
| 424 |
-
"
|
| 425 |
-
"
|
| 426 |
-
"
|
| 427 |
-
"
|
| 428 |
-
"
|
| 429 |
-
"
|
| 430 |
-
"
|
| 431 |
-
"
|
| 432 |
-
"
|
| 433 |
-
"
|
| 434 |
-
"
|
| 435 |
-
"
|
| 436 |
-
"
|
| 437 |
-
"
|
| 438 |
-
"
|
| 439 |
-
"
|
| 440 |
-
"
|
| 441 |
-
"
|
| 442 |
-
"
|
| 443 |
-
"
|
| 444 |
-
"
|
| 445 |
-
"
|
| 446 |
-
"
|
| 447 |
-
"
|
| 448 |
-
"
|
| 449 |
-
"
|
| 450 |
-
"
|
| 451 |
-
"
|
| 452 |
-
"
|
| 453 |
-
"
|
| 454 |
-
"
|
| 455 |
-
"
|
| 456 |
-
"
|
| 457 |
-
"
|
| 458 |
-
"
|
| 459 |
-
"
|
| 460 |
-
"
|
| 461 |
-
"
|
| 462 |
-
"
|
| 463 |
-
"##
|
| 464 |
-
"
|
| 465 |
-
"
|
| 466 |
-
"##
|
| 467 |
-
"
|
| 468 |
-
"
|
| 469 |
-
"
|
| 470 |
-
"
|
| 471 |
-
"
|
| 472 |
-
"
|
| 473 |
-
"
|
| 474 |
-
"
|
| 475 |
-
"
|
| 476 |
-
"
|
| 477 |
-
"
|
| 478 |
-
"
|
| 479 |
-
"
|
| 480 |
-
"
|
| 481 |
-
"
|
| 482 |
-
"
|
| 483 |
-
"
|
| 484 |
-
"
|
| 485 |
-
"
|
| 486 |
-
"
|
| 487 |
-
"
|
| 488 |
-
"##
|
| 489 |
-
"
|
| 490 |
-
"
|
| 491 |
-
"
|
| 492 |
-
"##
|
| 493 |
-
"
|
| 494 |
-
"
|
| 495 |
-
"
|
| 496 |
-
"
|
| 497 |
-
"
|
| 498 |
-
"
|
| 499 |
-
"
|
| 500 |
-
"
|
| 501 |
-
"
|
| 502 |
-
"
|
| 503 |
-
"
|
| 504 |
-
"##
|
| 505 |
-
"
|
| 506 |
-
"
|
| 507 |
-
"
|
| 508 |
-
"
|
| 509 |
-
"
|
| 510 |
-
"
|
| 511 |
-
"
|
| 512 |
-
"
|
| 513 |
-
"
|
| 514 |
-
"
|
| 515 |
-
"
|
| 516 |
-
"
|
| 517 |
-
"
|
| 518 |
-
"
|
| 519 |
-
"
|
| 520 |
-
"
|
| 521 |
-
"
|
| 522 |
-
"
|
| 523 |
-
"
|
| 524 |
-
"##
|
| 525 |
-
"
|
| 526 |
-
"##
|
| 527 |
-
"
|
| 528 |
-
"
|
| 529 |
-
"
|
| 530 |
-
"
|
| 531 |
-
"
|
| 532 |
-
"
|
| 533 |
-
"
|
| 534 |
-
"
|
| 535 |
-
"##
|
| 536 |
-
"
|
| 537 |
-
"
|
| 538 |
-
"
|
| 539 |
-
"
|
| 540 |
-
"
|
| 541 |
-
"
|
| 542 |
-
"
|
| 543 |
-
"
|
| 544 |
-
"
|
| 545 |
-
"
|
| 546 |
-
"
|
| 547 |
-
"##
|
| 548 |
-
"##
|
| 549 |
-
"
|
| 550 |
-
"
|
| 551 |
-
"
|
| 552 |
-
"
|
| 553 |
-
"##
|
| 554 |
-
"
|
| 555 |
-
"
|
| 556 |
-
"
|
| 557 |
-
"
|
| 558 |
-
"
|
| 559 |
-
"
|
| 560 |
-
"
|
| 561 |
-
"
|
| 562 |
-
"
|
| 563 |
-
"
|
| 564 |
-
"
|
| 565 |
-
"
|
| 566 |
-
"
|
| 567 |
-
"
|
| 568 |
-
"
|
| 569 |
-
"
|
| 570 |
-
"
|
| 571 |
-
"
|
| 572 |
-
"
|
| 573 |
-
"
|
| 574 |
-
"
|
| 575 |
-
"
|
| 576 |
-
"
|
| 577 |
"<|unk|>": 514,
|
| 578 |
"##ng": 515,
|
| 579 |
"##er": 516,
|
|
|
|
| 40 |
]
|
| 41 |
},
|
| 42 |
"pre_tokenizer": {
|
| 43 |
+
"type": "ByteLevel",
|
| 44 |
+
"add_prefix_space": true,
|
| 45 |
+
"trim_offsets": true,
|
| 46 |
+
"use_regex": true
|
| 47 |
},
|
| 48 |
"post_processor": {
|
| 49 |
"type": "ByteLevel",
|
|
|
|
| 321 |
"Ł": 255,
|
| 322 |
"ł": 256,
|
| 323 |
"Ń": 257,
|
| 324 |
+
"##>": 258,
|
| 325 |
"##z": 259,
|
| 326 |
+
"##Ï": 260,
|
| 327 |
+
"##ú": 261,
|
| 328 |
+
"##M": 262,
|
| 329 |
+
"##^": 263,
|
| 330 |
+
"##¢": 264,
|
| 331 |
+
"##T": 265,
|
| 332 |
+
"##.": 266,
|
| 333 |
+
"##6": 267,
|
| 334 |
+
"##:": 268,
|
| 335 |
+
"##Z": 269,
|
| 336 |
+
"##¯": 270,
|
| 337 |
+
"##È": 271,
|
| 338 |
+
"##q": 272,
|
| 339 |
+
"##ç": 273,
|
| 340 |
+
"##ò": 274,
|
| 341 |
+
"##9": 275,
|
| 342 |
+
"##ì": 276,
|
| 343 |
+
"##ě": 277,
|
| 344 |
+
"##k": 278,
|
| 345 |
+
"##u": 279,
|
| 346 |
+
"##$": 280,
|
| 347 |
+
"##ĥ": 281,
|
| 348 |
+
"##ē": 282,
|
| 349 |
+
"##-": 283,
|
| 350 |
+
"##Å": 284,
|
| 351 |
+
"##G": 285,
|
| 352 |
+
"##b": 286,
|
| 353 |
+
"##Ð": 287,
|
| 354 |
+
"##ğ": 288,
|
| 355 |
+
"##Â": 289,
|
| 356 |
+
"##č": 290,
|
| 357 |
+
"##÷": 291,
|
| 358 |
+
"##`": 292,
|
| 359 |
+
"##ĩ": 293,
|
| 360 |
+
"##Ù": 294,
|
| 361 |
+
"##ď": 295,
|
| 362 |
+
"##ä": 296,
|
| 363 |
+
"##¸": 297,
|
| 364 |
+
"##ê": 298,
|
| 365 |
+
"##¾": 299,
|
| 366 |
+
"##X": 300,
|
| 367 |
+
"##Ğ": 301,
|
| 368 |
+
"##+": 302,
|
| 369 |
+
"##Ü": 303,
|
| 370 |
+
"##ą": 304,
|
| 371 |
+
"##g": 305,
|
| 372 |
+
"##À": 306,
|
| 373 |
+
"##Ĉ": 307,
|
| 374 |
+
"##&": 308,
|
| 375 |
+
"##h": 309,
|
| 376 |
+
"##ñ": 310,
|
| 377 |
+
"##İ": 311,
|
| 378 |
+
"##%": 312,
|
| 379 |
+
"##ã": 313,
|
| 380 |
+
"##Í": 314,
|
| 381 |
+
"##Ď": 315,
|
| 382 |
+
"##H": 316,
|
| 383 |
+
"##ª": 317,
|
| 384 |
+
"##Ø": 318,
|
| 385 |
+
"##7": 319,
|
| 386 |
+
"##®": 320,
|
| 387 |
+
"##´": 321,
|
| 388 |
+
"##õ": 322,
|
| 389 |
+
"##¦": 323,
|
| 390 |
+
"##ı": 324,
|
| 391 |
+
"##¬": 325,
|
| 392 |
+
"##E": 326,
|
| 393 |
+
"##Ñ": 327,
|
| 394 |
+
"##3": 328,
|
| 395 |
+
"##é": 329,
|
| 396 |
+
"##ï": 330,
|
| 397 |
+
"##Ć": 331,
|
| 398 |
+
"##ħ": 332,
|
| 399 |
+
"##_": 333,
|
| 400 |
+
"##×": 334,
|
| 401 |
+
"##á": 335,
|
| 402 |
+
"##Ę": 336,
|
| 403 |
+
"##ļ": 337,
|
| 404 |
+
"##¿": 338,
|
| 405 |
+
"##¶": 339,
|
| 406 |
+
"##ă": 340,
|
| 407 |
+
"##ð": 341,
|
| 408 |
+
"##±": 342,
|
| 409 |
+
"##Ľ": 343,
|
| 410 |
+
"##R": 344,
|
| 411 |
+
"##Ö": 345,
|
| 412 |
+
"##Ĵ": 346,
|
| 413 |
+
"##ü": 347,
|
| 414 |
+
"##U": 348,
|
| 415 |
+
"##]": 349,
|
| 416 |
+
"##8": 350,
|
| 417 |
+
"##í": 351,
|
| 418 |
+
"##I": 352,
|
| 419 |
+
"##ø": 353,
|
| 420 |
+
"##F": 354,
|
| 421 |
+
"##æ": 355,
|
| 422 |
+
"##Ā": 356,
|
| 423 |
+
"##ĭ": 357,
|
| 424 |
+
"##m": 358,
|
| 425 |
+
"##è": 359,
|
| 426 |
+
"###": 360,
|
| 427 |
+
"##L": 361,
|
| 428 |
+
"##S": 362,
|
| 429 |
+
"##[": 363,
|
| 430 |
+
"##ë": 364,
|
| 431 |
+
"##i": 365,
|
| 432 |
+
"##IJ": 366,
|
| 433 |
+
"##c": 367,
|
| 434 |
+
"##(": 368,
|
| 435 |
+
"##Ġ": 369,
|
| 436 |
+
"##5": 370,
|
| 437 |
+
"##ċ": 371,
|
| 438 |
+
"##ó": 372,
|
| 439 |
+
"##s": 373,
|
| 440 |
+
"##å": 374,
|
| 441 |
+
"##»": 375,
|
| 442 |
+
"##~": 376,
|
| 443 |
+
"##Ń": 377,
|
| 444 |
+
"##į": 378,
|
| 445 |
+
"##C": 379,
|
| 446 |
+
"##p": 380,
|
| 447 |
+
"##*": 381,
|
| 448 |
+
"##@": 382,
|
| 449 |
+
"##ę": 383,
|
| 450 |
+
"##ė": 384,
|
| 451 |
+
"##Ĝ": 385,
|
| 452 |
+
"##à": 386,
|
| 453 |
+
"##V": 387,
|
| 454 |
+
"##Č": 388,
|
| 455 |
+
"##ö": 389,
|
| 456 |
+
"##?": 390,
|
| 457 |
+
"##Ħ": 391,
|
| 458 |
+
"##ī": 392,
|
| 459 |
+
"##Ĭ": 393,
|
| 460 |
+
"##Ē": 394,
|
| 461 |
+
"##e": 395,
|
| 462 |
+
"##Ú": 396,
|
| 463 |
+
"##Ò": 397,
|
| 464 |
+
"##ô": 398,
|
| 465 |
+
"##£": 399,
|
| 466 |
+
"##)": 400,
|
| 467 |
+
"##ł": 401,
|
| 468 |
+
"##Ĺ": 402,
|
| 469 |
+
"##{": 403,
|
| 470 |
+
"##°": 404,
|
| 471 |
+
"##x": 405,
|
| 472 |
+
"##N": 406,
|
| 473 |
+
"##Ä": 407,
|
| 474 |
+
"##ģ": 408,
|
| 475 |
+
"##!": 409,
|
| 476 |
+
"##µ": 410,
|
| 477 |
+
"##Ă": 411,
|
| 478 |
+
"##·": 412,
|
| 479 |
+
"##ÿ": 413,
|
| 480 |
+
"##v": 414,
|
| 481 |
+
"##J": 415,
|
| 482 |
+
"##a": 416,
|
| 483 |
+
"##³": 417,
|
| 484 |
+
"##Õ": 418,
|
| 485 |
+
"##0": 419,
|
| 486 |
+
"##û": 420,
|
| 487 |
+
"##4": 421,
|
| 488 |
+
"##Ī": 422,
|
| 489 |
+
"##Ķ": 423,
|
| 490 |
+
"##ć": 424,
|
| 491 |
+
"##w": 425,
|
| 492 |
+
"##©": 426,
|
| 493 |
+
"##ĸ": 427,
|
| 494 |
+
"##Ĕ": 428,
|
| 495 |
+
"##2": 429,
|
| 496 |
+
"##ý": 430,
|
| 497 |
+
"##É": 431,
|
| 498 |
+
"##n": 432,
|
| 499 |
+
"##Ĩ": 433,
|
| 500 |
+
"##ij": 434,
|
| 501 |
+
"##/": 435,
|
| 502 |
+
"##l": 436,
|
| 503 |
+
"##Û": 437,
|
| 504 |
+
"##Ã": 438,
|
| 505 |
+
"##=": 439,
|
| 506 |
+
"##ľ": 440,
|
| 507 |
+
"##r": 441,
|
| 508 |
+
"##K": 442,
|
| 509 |
+
"##¤": 443,
|
| 510 |
+
"##Đ": 444,
|
| 511 |
+
"##²": 445,
|
| 512 |
+
"##ġ": 446,
|
| 513 |
+
"##Y": 447,
|
| 514 |
+
"##ù": 448,
|
| 515 |
+
"##Ł": 449,
|
| 516 |
+
"##,": 450,
|
| 517 |
+
"##Ô": 451,
|
| 518 |
+
"##þ": 452,
|
| 519 |
+
"##º": 453,
|
| 520 |
+
"##P": 454,
|
| 521 |
+
"##j": 455,
|
| 522 |
+
"##§": 456,
|
| 523 |
+
"##¹": 457,
|
| 524 |
+
"##Ë": 458,
|
| 525 |
+
"##Á": 459,
|
| 526 |
+
"##â": 460,
|
| 527 |
+
"##'": 461,
|
| 528 |
+
"##¥": 462,
|
| 529 |
+
"##A": 463,
|
| 530 |
+
"##Ê": 464,
|
| 531 |
+
"##ā": 465,
|
| 532 |
+
"##ß": 466,
|
| 533 |
+
"##Ė": 467,
|
| 534 |
+
"##Į": 468,
|
| 535 |
+
"##Ģ": 469,
|
| 536 |
+
"##B": 470,
|
| 537 |
+
"##Ċ": 471,
|
| 538 |
+
"##}": 472,
|
| 539 |
+
"##î": 473,
|
| 540 |
+
"##Ě": 474,
|
| 541 |
+
"##f": 475,
|
| 542 |
+
"##Ç": 476,
|
| 543 |
+
"##<": 477,
|
| 544 |
+
"##½": 478,
|
| 545 |
+
"##¡": 479,
|
| 546 |
+
"##W": 480,
|
| 547 |
+
"##t": 481,
|
| 548 |
+
"##đ": 482,
|
| 549 |
+
"##Ŀ": 483,
|
| 550 |
+
"##1": 484,
|
| 551 |
+
"##d": 485,
|
| 552 |
+
"##Ì": 486,
|
| 553 |
+
"##Ý": 487,
|
| 554 |
+
"##Ą": 488,
|
| 555 |
+
"##o": 489,
|
| 556 |
+
"##y": 490,
|
| 557 |
+
"##\"": 491,
|
| 558 |
+
"##¨": 492,
|
| 559 |
+
"##«": 493,
|
| 560 |
+
"##ĝ": 494,
|
| 561 |
+
"##ķ": 495,
|
| 562 |
+
"##;": 496,
|
| 563 |
+
"##Q": 497,
|
| 564 |
+
"##O": 498,
|
| 565 |
+
"##ĺ": 499,
|
| 566 |
+
"##\\": 500,
|
| 567 |
+
"##Ó": 501,
|
| 568 |
+
"##ĵ": 502,
|
| 569 |
+
"##|": 503,
|
| 570 |
+
"##Î": 504,
|
| 571 |
+
"##Æ": 505,
|
| 572 |
+
"##¼": 506,
|
| 573 |
+
"##Ĥ": 507,
|
| 574 |
+
"##Þ": 508,
|
| 575 |
+
"##Ļ": 509,
|
| 576 |
+
"##ĕ": 510,
|
| 577 |
+
"##D": 511,
|
| 578 |
+
"##ĉ": 512,
|
| 579 |
+
"##ŀ": 513,
|
| 580 |
"<|unk|>": 514,
|
| 581 |
"##ng": 515,
|
| 582 |
"##er": 516,
|
fw57M_Surprisal_thresholdB_16000/vocab.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|