Byte-lingua-code / superbpe /tokenizers_superbpe /bindings /node /examples /documentation /pipeline.test.ts
/* eslint-disable */
// Keep a handle on the real module loader: the suite below declares its own
// `require`, which shadows this one inside the `describe` callback.
// Left as `var` on purpose: this file has no import/export, so TypeScript
// treats it as a global script, and `var` tolerates a same-named declaration
// in sibling scripts where `let`/`const` would not.
var globRequire = require;

/**
 * Executable copies of the "pipeline" documentation snippets.
 *
 * Regions delimited by `// START <name>` / `// END <name>` appear to be
 * snippet markers for the documentation (TODO confirm against the docs
 * tooling), so the code between a START/END pair is kept exactly as a user
 * would type it. That is why the snippets use `let`/`var`, repeat
 * `require("tokenizers")`, and redeclare some names with `var`.
 * Assertions and explanatory comments live outside the marked regions.
 */
describe("pipelineExample", () => {
  /**
   * Local stand-in for `require` so snippets can be written with the module
   * path a user would use.
   *
   * @param mod Module specifier as written in the snippet.
   * @returns The module loaded from `../../` when `mod` is `"tokenizers"` or a
   *   `"tokenizers/..."` subpath; otherwise whatever the real loader returns
   *   for `mod`.
   */
  function require(mod: string) {
    // Match the package name itself or one of its subpaths only, so unrelated
    // packages that merely start with "tokenizers" are not redirected.
    const isTokenizers = mod === "tokenizers" || mod.startsWith("tokenizers/");
    return isTokenizers ? globRequire("../../") : globRequire(mod);
  }

  // Shadow the global console with a no-op `log` so the `console.log(...)`
  // calls inside the snippets do not print while the tests run.
  const console = {
    log: (..._args: any[]) => {}
  };

  // Walks through normalizer, pre-tokenizer, post-processor and decoding on a
  // tokenizer loaded from data/tokenizer-wiki.json.
  it("shows pipeline parts", async () => {
    // START reload_tokenizer
    let { Tokenizer } = require("tokenizers");
    let tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
    // END reload_tokenizer
    // START setup_normalizer
    let { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers");
    let normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
    // END setup_normalizer
    // START test_normalizer
    let normalized = normalizer.normalizeString("Héllò hôw are ü?")
    // "Hello how are u?"
    // END test_normalizer
    expect(normalized).toEqual("Hello how are u?");
    // START replace_normalizer
    tokenizer.setNormalizer(normalizer)
    // END replace_normalizer
    // START setup_pre_tokenizer
    let { whitespacePreTokenizer } = require("tokenizers");
    var preTokenizer = whitespacePreTokenizer();
    var preTokenized = preTokenizer.preTokenizeString("Hello! How are you? I'm fine, thank you.");
    // END setup_pre_tokenizer
    // Each entry is [token, [start, end]] with offsets into the input string.
    expect(preTokenized).toEqual([
      ["Hello", [0, 5]],
      ["!", [5, 6]],
      ["How", [7, 10]],
      ["are", [11, 14]],
      ["you", [15, 18]],
      ["?", [18, 19]],
      ["I", [20, 21]],
      ["'", [21, 22]],
      ['m', [22, 23]],
      ["fine", [24, 28]],
      [",", [28, 29]],
      ["thank", [30, 35]],
      ["you", [36, 39]],
      [".", [39, 40]]
    ]);
    // START combine_pre_tokenizer
    let { sequencePreTokenizer, digitsPreTokenizer } = require("tokenizers");
    var preTokenizer = sequencePreTokenizer([whitespacePreTokenizer(), digitsPreTokenizer(true)]);
    var preTokenized = preTokenizer.preTokenizeString("Call 911!");
    // END combine_pre_tokenizer
    // Verify the combined pre-tokenizer as well: whitespace splitting followed
    // by digit splitting with the `true` flag yields one entry per digit.
    expect(preTokenized).toEqual([
      ["Call", [0, 4]],
      ["9", [5, 6]],
      ["1", [6, 7]],
      ["1", [7, 8]],
      ["!", [8, 9]]
    ]);
    // START replace_pre_tokenizer
    tokenizer.setPreTokenizer(preTokenizer)
    // END replace_pre_tokenizer
    // START setup_processor
    let { templateProcessing } = require("tokenizers");
    tokenizer.setPostProcessor(templateProcessing(
      "[CLS] $A [SEP]",
      "[CLS] $A [SEP] $B:1 [SEP]:1",
      [["[CLS]", 1], ["[SEP]", 2]]
    ));
    // END setup_processor
    // START test_decoding
    let output = await tokenizer.encode("Hello, y'all! How are you 😁 ?");
    console.log(output.getIds());
    // [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
    let decoded = await tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], true);
    // "Hello , y ' all ! How are you ?"
    // END test_decoding
    expect(decoded).toEqual("Hello , y ' all ! How are you ?");
  });

  // Skipped case: builds a WordPiece tokenizer, trains it on the wikitext-103
  // raw splits and saves it to data/bert-wiki.json, the file the next test
  // loads. NOTE(review): presumably skipped because training is slow and
  // needs the dataset on disk; confirm how data/bert-wiki.json is produced
  // for regular runs.
  it.skip("trains the tokenizer", async () => {
    // START bert_setup_tokenizer
    let { Tokenizer } = require("tokenizers");
    let { WordPiece } = require("tokenizers");
    let bertTokenizer = new Tokenizer(WordPiece.init({}, { unkToken: "[UNK]" }));
    // END bert_setup_tokenizer
    // START bert_setup_normalizer
    let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
      = require("tokenizers");
    bertTokenizer.setNormalizer(sequenceNormalizer([
      nfdNormalizer(), lowercaseNormalizer(), stripAccentsNormalizer()
    ]))
    // END bert_setup_normalizer
    // START bert_setup_pre_tokenizer
    let { whitespacePreTokenizer } = require("tokenizers");
    bertTokenizer.setPreTokenizer(whitespacePreTokenizer());
    // END bert_setup_pre_tokenizer
    // START bert_setup_processor
    let { templateProcessing } = require("tokenizers");
    bertTokenizer.setPostProcessor(templateProcessing(
      "[CLS] $A [SEP]",
      "[CLS] $A [SEP] $B:1 [SEP]:1",
      [["[CLS]", 1], ["[SEP]", 2]]
    ));
    // END bert_setup_processor
    // START bert_train_tokenizer
    let { wordPieceTrainer } = require("tokenizers");
    let trainer = wordPieceTrainer({
      vocabSize: 30522,
      specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    });
    let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
    bertTokenizer.train(files, trainer);
    bertTokenizer.save("data/bert-wiki.json")
    // END bert_train_tokenizer
  });

  // Loads data/bert-wiki.json and contrasts decoding without a decoder
  // (word-piece "##" prefixes remain) against decoding after a WordPiece
  // decoder has been installed.
  it("shows a full bert example", async () => {
    let { Tokenizer } = require("tokenizers");
    let bertTokenizer = await Tokenizer.fromFile("data/bert-wiki.json")
    // START bert_test_decoding
    let output = await bertTokenizer.encode("Welcome to the 🤗 Tokenizers library.");
    console.log(output.getTokens());
    // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
    var decoded = await bertTokenizer.decode(output.getIds(), true);
    // "welcome to the tok ##eni ##zer ##s library ."
    // END bert_test_decoding
    expect(decoded).toEqual("welcome to the tok ##eni ##zer ##s library .");
    // START bert_proper_decoding
    let { wordPieceDecoder } = require("tokenizers");
    bertTokenizer.setDecoder(wordPieceDecoder());
    var decoded = await bertTokenizer.decode(output.getIds(), true);
    // "welcome to the tokenizers library."
    // END bert_proper_decoding
    expect(decoded).toEqual("welcome to the tokenizers library.");
  });
});