File size: 774 Bytes
72c0672 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | use tokenizers::models::wordpiece::WordPiece;
use tokenizers::{AddedToken, Tokenizer};
fn main() {
let start = std::time::Instant::now();
let mut tokenizer = Tokenizer::new(WordPiece::default());
// Mix special and not special
// You can make sure ids are in order, and special status is correct.
let tokens: Vec<_> = (0..120_000)
.map(|i| AddedToken::from(format!("[SPECIAL_{i}]"), i % 2 == 0))
.collect();
tokenizer.add_tokens(&tokens);
tokenizer.save("_tok.json", true).unwrap();
println!("Save took {:?}", start.elapsed());
let start = std::time::Instant::now();
let _tok = Tokenizer::from_file("_tok.json").unwrap();
println!("Took {:?}", start.elapsed());
std::fs::remove_file("_tok.json").unwrap();
}
|