File size: 6,126 Bytes
72c0672 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | /* eslint-disable */
var globRequire = require;
describe("pipelineExample", () => {
// This is a hack to let us require using path similar to what the user has to use
function require(mod: string) {
if (mod.startsWith("tokenizers")) {
// let path = mod.slice("tokenizers".length);
return globRequire("../../");
} else {
return globRequire(mod);
}
}
let console = {
log: (..._args: any[]) => {}
};
it("shows pipeline parts", async () => {
// START reload_tokenizer
let { Tokenizer } = require("tokenizers");
let tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
// END reload_tokenizer
// START setup_normalizer
let { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers");
let normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
// END setup_normalizer
// START test_normalizer
let normalized = normalizer.normalizeString("Héllò hôw are ü?")
// "Hello how are u?"
// END test_normalizer
expect(normalized).toEqual("Hello how are u?");
// START replace_normalizer
tokenizer.setNormalizer(normalizer)
// END replace_normalizer
// START setup_pre_tokenizer
let { whitespacePreTokenizer } = require("tokenizers");
var preTokenizer = whitespacePreTokenizer();
var preTokenized = preTokenizer.preTokenizeString("Hello! How are you? I'm fine, thank you.");
// END setup_pre_tokenizer
expect(preTokenized).toEqual([
["Hello", [0, 5]],
["!", [5, 6]],
["How", [7, 10]],
["are", [11, 14]],
["you", [15, 18]],
["?", [18, 19]],
["I", [20, 21]],
["'", [21, 22]],
['m', [22, 23]],
["fine", [24, 28]],
[",", [28, 29]],
["thank", [30, 35]],
["you", [36, 39]],
[".", [39, 40]]
]);
// START combine_pre_tokenizer
let { sequencePreTokenizer, digitsPreTokenizer } = require("tokenizers");
var preTokenizer = sequencePreTokenizer([whitespacePreTokenizer(), digitsPreTokenizer(true)]);
var preTokenized = preTokenizer.preTokenizeString("Call 911!");
// END combine_pre_tokenizer
// START replace_pre_tokenizer
tokenizer.setPreTokenizer(preTokenizer)
// END replace_pre_tokenizer
// START setup_processor
let { templateProcessing } = require("tokenizers");
tokenizer.setPostProcessor(templateProcessing(
"[CLS] $A [SEP]",
"[CLS] $A [SEP] $B:1 [SEP]:1",
[["[CLS]", 1], ["[SEP]", 2]]
));
// END setup_processor
// START test_decoding
let output = await tokenizer.encode("Hello, y'all! How are you 😁 ?");
console.log(output.getIds());
// [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
let decoded = await tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], true);
// "Hello , y ' all ! How are you ?"
// END test_decoding
expect(decoded).toEqual("Hello , y ' all ! How are you ?");
});
it.skip("trains the tokenizer", async () => {
// START bert_setup_tokenizer
let { Tokenizer } = require("tokenizers");
let { WordPiece } = require("tokenizers");
let bertTokenizer = new Tokenizer(WordPiece.init({}, { unkToken: "[UNK]" }));
// END bert_setup_tokenizer
// START bert_setup_normalizer
let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
= require("tokenizers");
bertTokenizer.setNormalizer(sequenceNormalizer([
nfdNormalizer(), lowercaseNormalizer(), stripAccentsNormalizer()
]))
// END bert_setup_normalizer
// START bert_setup_pre_tokenizer
let { whitespacePreTokenizer } = require("tokenizers");
bertTokenizer.setPreTokenizer(whitespacePreTokenizer());
// END bert_setup_pre_tokenizer
// START bert_setup_processor
let { templateProcessing } = require("tokenizers");
bertTokenizer.setPostProcessor(templateProcessing(
"[CLS] $A [SEP]",
"[CLS] $A [SEP] $B:1 [SEP]:1",
[["[CLS]", 1], ["[SEP]", 2]]
));
// END bert_setup_processor
// START bert_train_tokenizer
let { wordPieceTrainer } = require("tokenizers");
let trainer = wordPieceTrainer({
vocabSize: 30522,
specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
});
let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
bertTokenizer.train(files, trainer);
bertTokenizer.save("data/bert-wiki.json")
// END bert_train_tokenizer
});
it("shows a full bert example", async () => {
let { Tokenizer } = require("tokenizers");
let bertTokenizer = await Tokenizer.fromFile("data/bert-wiki.json")
// START bert_test_decoding
let output = await bertTokenizer.encode("Welcome to the 🤗 Tokenizers library.");
console.log(output.getTokens());
// ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
var decoded = await bertTokenizer.decode(output.getIds(), true);
// "welcome to the tok ##eni ##zer ##s library ."
// END bert_test_decoding
expect(decoded).toEqual("welcome to the tok ##eni ##zer ##s library .");
// START bert_proper_decoding
let { wordPieceDecoder } = require("tokenizers");
bertTokenizer.setDecoder(wordPieceDecoder());
var decoded = await bertTokenizer.decode(output.getIds(), true);
// "welcome to the tokenizers library."
// END bert_proper_decoding
expect(decoded).toEqual("welcome to the tokenizers library.");
});
});
|