// Byte-lingua-code/superbpe/tokenizers_superbpe/bindings/node/examples/documentation/quicktour.test.ts
/* eslint-disable */

// Keep a handle on the module-level `require`. The suite below declares its own
// `require`, which shadows this one inside the `describe` callback.
var globRequire = require

// Replace console.log with a no-op so the `console.log(...)` calls in the
// documentation snippets do not write anything while the tests run.
console.log = function (..._ignored: any[]): void {}
describe('quicktourExample', () => {
  /**
   * Stand-in for `require` used by the snippets in this suite.
   *
   * A module name that starts with 'tokenizers' is loaded from '../../';
   * any other name is handed unchanged to the real `require` (`globRequire`).
   */
  function require(mod: string) {
    const target = mod.startsWith('tokenizers') ? '../../' : mod
    return globRequire(target)
  }

  // Skipped via `it.skip`. When enabled, it builds a BPE tokenizer, trains it on the
  // three wikitext-103 raw files under data/, and saves it to 'data/tokenizer-wiki.json'.
  // The `// START name` / `// END name` comment pairs delimit named snippets.
  it.skip('trains the tokenizer', async () => {
    // START init_tokenizer
    const { Tokenizer } = require('tokenizers')
    const { BPE } = require('tokenizers')
    const tokenizer = new Tokenizer(BPE.init({}, [], { unkToken: '[UNK]' }))
    // END init_tokenizer
    // START init_trainer
    const { bpeTrainer } = require('tokenizers')
    const trainer = bpeTrainer({
      specialTokens: ['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'],
    })
    // END init_trainer
    // START init_pretok
    const { whitespacePreTokenizer } = require('tokenizers')
    tokenizer.setPreTokenizer(whitespacePreTokenizer())
    // END init_pretok
    // START train
    const files = ['test', 'train', 'valid'].map((name) => `data/wikitext-103-raw/wiki.${name}.raw`)
    tokenizer.train(files, trainer)
    // END train
    // START save
    tokenizer.save('data/tokenizer-wiki.json')
    // END save
  })

  // Loads 'data/tokenizer-wiki.json' (the path the skipped test above saves to) and checks
  // tokens, ids, offsets, template post-processing, type ids, and batch padding.
  // Assertions sit outside the START/END snippet regions.
  it('shows a quicktour example', async () => {
    const { Tokenizer } = require('tokenizers')
    // START reload_tokenizer
    const tokenizer = Tokenizer.fromFile('data/tokenizer-wiki.json')
    // END reload_tokenizer
    // START encode
    var output = await tokenizer.encode("Hello, y'all! How are you π ?")
    // END encode
    // START print_tokens
    console.log(output.getTokens())
    // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
    // END print_tokens
    const plainTokens = ['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?']
    expect(output.getTokens()).toEqual(plainTokens)
    // START print_ids
    console.log(output.getIds())
    // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
    // END print_ids
    const plainIds = [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
    expect(output.getIds()).toEqual(plainIds)
    // START print_offsets
    const offsets = output.getOffsets()
    console.log(offsets[9])
    // [26, 27]
    // END print_offsets
    expect(offsets[9]).toEqual([26, 27])
    // START use_offsets
    const { slice } = require('tokenizers')
    const sentence = "Hello, y'all! How are you π ?"
    const [start, end] = offsets[9]
    console.log(slice(sentence, start, end))
    // "π"
    // END use_offsets
    expect(slice(sentence, start, end)).toEqual('π')
    // START check_sep
    console.log(tokenizer.tokenToId('[SEP]'))
    // 2
    // END check_sep
    expect(tokenizer.tokenToId('[SEP]')).toEqual(2)
    // START init_template_processing
    const { templateProcessing } = require('tokenizers')
    tokenizer.setPostProcessor(
      templateProcessing('[CLS] $A [SEP]', '[CLS] $A [SEP] $B:1 [SEP]:1', [
        ['[CLS]', tokenizer.tokenToId('[CLS]')],
        ['[SEP]', tokenizer.tokenToId('[SEP]')],
      ]),
    )
    // END init_template_processing
    // START print_special_tokens
    var output = await tokenizer.encode("Hello, y'all! How are you π ?")
    console.log(output.getTokens())
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens
    // Same tokens as before the post-processor was set, now wrapped in [CLS] ... [SEP].
    const wrappedTokens = ['[CLS]', ...plainTokens, '[SEP]']
    expect(output.getTokens()).toEqual(wrappedTokens)
    // START print_special_tokens_pair
    var output = await tokenizer.encode("Hello, y'all!", 'How are you π ?')
    console.log(output.getTokens())
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens_pair
    const pairTokens = [
      ...['[CLS]', 'Hello', ',', 'y', "'", 'all', '!', '[SEP]'],
      ...['How', 'are', 'you', '[UNK]', '?', '[SEP]'],
    ]
    expect(output.getTokens()).toEqual(pairTokens)
    // START print_type_ids
    console.log(output.getTypeIds())
    // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    // END print_type_ids
    const pairTypeIds = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    expect(output.getTypeIds()).toEqual(pairTypeIds)
    // START encode_batch
    var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you π ?'])
    // END encode_batch
    // START encode_batch_pair
    // var output = await tokenizer.encodeBatch(
    // [["Hello, y'all!", "How are you π ?"], ["Hello to you too!", "I'm fine, thank you!"]]
    // );
    // END encode_batch_pair
    // NOTE(review): padId 3 is the index of '[PAD]' in the specialTokens list passed to the
    // trainer in the skipped test; presumably the saved vocabulary assigns ids in that order — confirm.
    // START enable_padding
    tokenizer.setPadding({ padId: 3, padToken: '[PAD]' })
    // END enable_padding
    // START print_batch_tokens
    var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you π ?'])
    console.log(output[1].getTokens())
    // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
    // END print_batch_tokens
    const paddedTokens = ['[CLS]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]', '[PAD]']
    expect(output[1].getTokens()).toEqual(paddedTokens)
    // START print_attention_mask
    console.log(output[1].getAttentionMask())
    // [1, 1, 1, 1, 1, 1, 1, 0]
    // END print_attention_mask
    const paddedMask = [1, 1, 1, 1, 1, 1, 1, 0]
    expect(output[1].getAttentionMask()).toEqual(paddedMask)
  })
})