File size: 774 Bytes
72c0672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
use tokenizers::models::wordpiece::WordPiece;
use tokenizers::{AddedToken, Tokenizer};

fn main() {
    let start = std::time::Instant::now();
    let mut tokenizer = Tokenizer::new(WordPiece::default());

    // Mix special and not special
    // You can make sure ids are in order, and special status is correct.
    let tokens: Vec<_> = (0..120_000)
        .map(|i| AddedToken::from(format!("[SPECIAL_{i}]"), i % 2 == 0))
        .collect();
    tokenizer.add_tokens(&tokens);
    tokenizer.save("_tok.json", true).unwrap();
    println!("Save took {:?}", start.elapsed());
    let start = std::time::Instant::now();
    let _tok = Tokenizer::from_file("_tok.json").unwrap();
    println!("Took {:?}", start.elapsed());
    std::fs::remove_file("_tok.json").unwrap();
}