File size: 6,126 Bytes
72c0672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/* eslint-disable */
// Keep a handle on the real `require` under a different name: inside the
// `describe` callback below, a local function named `require` shadows it and
// forwards to this alias.
var globRequire = require;

describe("pipelineExample", () => {
    /**
     * Local stand-in for `require`, so the snippets below can be written with
     * the same module specifier an end user would type.
     *
     * Any specifier that begins with "tokenizers" is served from "../../" via
     * the real `require` (held in `globRequire`); every other specifier is
     * forwarded to it unchanged.
     */
    function require(mod: string) {
        const target = mod.startsWith("tokenizers") ? "../../" : mod;
        return globRequire(target);
    }
    // Silent replacement for the global `console` within this suite: `log`
    // accepts any arguments and does nothing, so the `console.log` calls in the
    // snippets below produce no output.
    let console = {
        log(..._args: any[]): void {}
    };

    // Walks one shared `tokenizer` (loaded from data/tokenizer-wiki.json) through
    // its stages in order: normalizer, pre-tokenizer, post-processor, then
    // encode/decode. Later steps depend on the state set by earlier ones.
    //
    // The `// START <name>` / `// END <name>` pairs delimit named regions —
    // presumably extracted verbatim into documentation (TODO confirm), so the
    // code between them is kept self-contained and the checks sit outside them.
    it("shows pipeline parts", async () => {
        // START reload_tokenizer
        let { Tokenizer } = require("tokenizers");

        let tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
        // END reload_tokenizer
        // START setup_normalizer
        let { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers");

        let normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
        // END setup_normalizer
        // START test_normalizer
        let normalized = normalizer.normalizeString("Héllò hôw are ü?")
        // "Hello how are u?"
        // END test_normalizer
        // Pin the value quoted in the snippet comment above.
        expect(normalized).toEqual("Hello how are u?");
        // START replace_normalizer
        tokenizer.setNormalizer(normalizer)
        // END replace_normalizer
        // `preTokenizer` / `preTokenized` are declared with `var` in the next
        // snippet because a later snippet in this same function declares them
        // again; `let` would reject the redeclaration.
        // START setup_pre_tokenizer
        let { whitespacePreTokenizer } = require("tokenizers");

        var preTokenizer = whitespacePreTokenizer();
        var preTokenized = preTokenizer.preTokenizeString("Hello! How are you? I'm fine, thank you.");
        // END setup_pre_tokenizer
        // Each expected entry pairs a piece of the input with its [start, end)
        // position in the input string (e.g. "Hello" -> [0, 5], "!" -> [5, 6]).
        expect(preTokenized).toEqual([
            ["Hello", [0, 5]],
            ["!", [5, 6]],
            ["How", [7, 10]],
            ["are", [11, 14]],
            ["you", [15, 18]],
            ["?", [18, 19]],
            ["I", [20, 21]],
            ["'", [21, 22]],
            ['m', [22, 23]],
            ["fine", [24, 28]],
            [",", [28, 29]],
            ["thank", [30, 35]],
            ["you", [36, 39]],
            [".", [39, 40]]
        ]);
        // START combine_pre_tokenizer
        let { sequencePreTokenizer, digitsPreTokenizer } = require("tokenizers");

        var preTokenizer = sequencePreTokenizer([whitespacePreTokenizer(), digitsPreTokenizer(true)]);
        var preTokenized = preTokenizer.preTokenizeString("Call 911!");
        // END combine_pre_tokenizer
        // NOTE(review): the result for "Call 911!" is computed but never
        // asserted; only the combined pre-tokenizer itself is used below.
        // START replace_pre_tokenizer
        tokenizer.setPreTokenizer(preTokenizer)
        // END replace_pre_tokenizer
        // The last argument below pairs "[CLS]" with 1 and "[SEP]" with 2 —
        // presumably their vocabulary ids (the id list decoded further down
        // starts with 1 and ends with 2); verify against the tokenizer file.
        // START setup_processor
        let { templateProcessing } = require("tokenizers");

        tokenizer.setPostProcessor(templateProcessing(
            "[CLS] $A [SEP]",
            "[CLS] $A [SEP] $B:1 [SEP]:1",
            [["[CLS]", 1], ["[SEP]", 2]]
        ));
        // END setup_processor
        // START test_decoding
        let output = await tokenizer.encode("Hello, y'all! How are you 😁 ?");
        console.log(output.getIds());
        // [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]

        let decoded = await tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], true);
        // "Hello , y ' all ! How are you ?"
        // END test_decoding
        // The decode call above is fed a literal id list rather than
        // `output.getIds()`, so the encode result itself is not checked here.
        // The expected text has no "[CLS]"/"[SEP]"; presumably the `true`
        // argument drops special tokens — TODO confirm.
        expect(decoded).toEqual("Hello , y ' all ! How are you ?");
    });

    // Builds a WordPiece tokenizer from scratch (normalizer, pre-tokenizer,
    // post-processor), trains it on the three wikitext-103 raw splits and saves
    // it to data/bert-wiki.json. Registered with `it.skip`, so it does not run;
    // presumably because training needs the dataset on disk and is slow —
    // TODO confirm. It contains no `expect` calls.
    it.skip("trains the tokenizer", async () => {
        // START bert_setup_tokenizer
        let { Tokenizer } = require("tokenizers");
        let { WordPiece } = require("tokenizers");

        let bertTokenizer = new Tokenizer(WordPiece.init({}, { unkToken: "[UNK]" }));
        // END bert_setup_tokenizer
        // START bert_setup_normalizer
        let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
            = require("tokenizers");

        bertTokenizer.setNormalizer(sequenceNormalizer([
            nfdNormalizer(), lowercaseNormalizer(), stripAccentsNormalizer()
        ]))
        // END bert_setup_normalizer
        // START bert_setup_pre_tokenizer
        let { whitespacePreTokenizer } = require("tokenizers");

        bertTokenizer.setPreTokenizer(whitespacePreTokenizer());
        // END bert_setup_pre_tokenizer
        // START bert_setup_processor
        let { templateProcessing } = require("tokenizers");

        bertTokenizer.setPostProcessor(templateProcessing(
            "[CLS] $A [SEP]",
            "[CLS] $A [SEP] $B:1 [SEP]:1",
            [["[CLS]", 1], ["[SEP]", 2]]
        ));
        // END bert_setup_processor
        // NOTE(review): in the next snippet the results of `train` and `save`
        // are not awaited even though the callback is async — confirm both are
        // synchronous in the bindings under test.
        // START bert_train_tokenizer
        let { wordPieceTrainer } = require("tokenizers");

        let trainer = wordPieceTrainer({
            vocabSize: 30522,
            specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
        });
        let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
        bertTokenizer.train(files, trainer);

        bertTokenizer.save("data/bert-wiki.json")
        // END bert_train_tokenizer
    });

    // Loads data/bert-wiki.json and decodes one encoded sentence twice: first
    // with the tokenizer as loaded, then after installing a WordPiece decoder.
    // The file path is the one the skipped training test saves to, so the file
    // presumably has to exist on disk already — TODO confirm how it is provided.
    it("shows a full bert example", async () => {
        let { Tokenizer } = require("tokenizers");
        let bertTokenizer = await Tokenizer.fromFile("data/bert-wiki.json")

        // START bert_test_decoding

        let output = await bertTokenizer.encode("Welcome to the 🤗 Tokenizers library.");
        console.log(output.getTokens());
        // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]

        var decoded = await bertTokenizer.decode(output.getIds(), true);
        // "welcome to the tok ##eni ##zer ##s library ."
        // END bert_test_decoding
        // Before a decoder is set (next snippet), the expected text still
        // carries the "##" prefixes and space-separated pieces.
        expect(decoded).toEqual("welcome to the tok ##eni ##zer ##s library .");
        // `decoded` is declared with `var` in both snippets so the second one
        // can redeclare it within the same function.
        // START bert_proper_decoding
        let { wordPieceDecoder } = require("tokenizers");
        bertTokenizer.setDecoder(wordPieceDecoder());
        var decoded = await bertTokenizer.decode(output.getIds(), true);
        // "welcome to the tokenizers library."
        // END bert_proper_decoding
        // With the WordPiece decoder installed, the same ids are expected to
        // decode with the pieces joined back together.
        expect(decoded).toEqual("welcome to the tokenizers library.");
    });
});