| use tokenizers::Tokenizer; | |
| fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> { | |
| let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?; | |
| let data = std::fs::read_to_string("data/big.txt")?; | |
| let data: Vec<_> = data.lines().collect(); | |
| let add_special_tokens = false; | |
| tokenizer.encode_batch_char_offsets(data, add_special_tokens)?; | |
| Ok(()) | |
| } | |