2ira's picture
offline_compression_graph_code
72c0672 verified
use tokenizers::models::wordpiece::WordPiece;
use tokenizers::{AddedToken, Tokenizer};
fn main() {
let start = std::time::Instant::now();
let mut tokenizer = Tokenizer::new(WordPiece::default());
// Mix special and not special
// You can make sure ids are in order, and special status is correct.
let tokens: Vec<_> = (0..120_000)
.map(|i| AddedToken::from(format!("[SPECIAL_{i}]"), i % 2 == 0))
.collect();
tokenizer.add_tokens(&tokens);
tokenizer.save("_tok.json", true).unwrap();
println!("Save took {:?}", start.elapsed());
let start = std::time::Instant::now();
let _tok = Tokenizer::from_file("_tok.json").unwrap();
println!("Took {:?}", start.elapsed());
std::fs::remove_file("_tok.json").unwrap();
}