#![warn(clippy::all)]
#![allow(clippy::upper_case_acronyms)]
#![doc(html_favicon_url = "https://huggingface.co/favicon.ico")]
#![doc(html_logo_url = "https://huggingface.co/landing/assets/huggingface_logo.svg")]

//! The core of `tokenizers`, written in Rust.
//! Provides an implementation of today's most used tokenizers, with a focus on performance and
//! versatility.
//!
//! # What is a Tokenizer
//!
//! A Tokenizer works as a pipeline: it processes some raw text as input and outputs an `Encoding`.
//! The various steps of the pipeline are:
//!
//! 1. The `Normalizer`: in charge of normalizing the text. Common examples of normalization are
//!    the [unicode normalization standards](https://unicode.org/reports/tr15/#Norm_Forms), such as `NFD` or `NFKC`.
//!    More details about how to use the `Normalizers` are available in the
//!    [Hugging Face documentation](https://huggingface.co/docs/tokenizers/components#normalizers).
//! 2. The `PreTokenizer`: in charge of creating initial word splits in the text. The most common way of
//!    splitting text is simply on whitespace.
//! 3. The `Model`: in charge of doing the actual tokenization. An example of a `Model` would be
//!    `BPE` or `WordPiece`.
//! 4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant
//!    that, for example, a language model would need, such as special tokens.
//!
//! ## Loading a pretrained tokenizer from the Hub
//! ```
//! use tokenizers::tokenizer::{Result, Tokenizer};
//!
//! fn main() -> Result<()> {
//!     # #[cfg(feature = "http")]
//!     # {
//!         let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None)?;
//!
//!         let encoding = tokenizer.encode("Hey there!", false)?;
//!         println!("{:?}", encoding.get_tokens());
//!     # }
//!     Ok(())
//! }
//! ```
//!
//! ## Deserialization and tokenization example
//!
//! ```no_run
//! use tokenizers::tokenizer::{Result, Tokenizer, EncodeInput};
//! use tokenizers::models::bpe::BPE;
//!
//! fn main() -> Result<()> {
//!     let bpe_builder = BPE::from_file("./path/to/vocab.json", "./path/to/merges.txt");
//!     let bpe = bpe_builder
//!         .dropout(0.1)
//!         .unk_token("[UNK]".into())
//!         .build()?;
//!
//!     let mut tokenizer = Tokenizer::new(bpe);
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_tokens());
//!
//!     Ok(())
//! }
//! ```
//!
//! ## Training and serialization example
//!
//! ```no_run
//! use tokenizers::decoders::DecoderWrapper;
//! use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
//! use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence, NormalizerWrapper};
//! use tokenizers::pre_tokenizers::byte_level::ByteLevel;
//! use tokenizers::pre_tokenizers::PreTokenizerWrapper;
//! use tokenizers::processors::PostProcessorWrapper;
//! use tokenizers::{AddedToken, Model, Result, TokenizerBuilder};
//!
//! use std::path::Path;
//!
//! fn main() -> Result<()> {
//!     let vocab_size: usize = 100;
//!
//!     let mut trainer = BpeTrainerBuilder::new()
//!         .show_progress(true)
//!         .vocab_size(vocab_size)
//!         .min_frequency(0)
//!         .special_tokens(vec![
//!             AddedToken::from(String::from("<s>"), true),
//!             AddedToken::from(String::from("<pad>"), true),
//!             AddedToken::from(String::from("</s>"), true),
//!             AddedToken::from(String::from("<unk>"), true),
//!             AddedToken::from(String::from("<mask>"), true),
//!         ])
//!         .build();
//!
//!     let mut tokenizer = TokenizerBuilder::new()
//!         .with_model(BPE::default())
//!         .with_normalizer(Some(Sequence::new(vec![
//!             Strip::new(true, true).into(),
//!             NFC.into(),
//!         ])))
//!         .with_pre_tokenizer(Some(ByteLevel::default()))
//!         .with_post_processor(Some(ByteLevel::default()))
//!         .with_decoder(Some(ByteLevel::default()))
//!         .build()?;
//!
//!     let pretty = false;
//!     tokenizer
//!         .train_from_files(
//!             &mut trainer,
//!             vec!["path/to/vocab.txt".to_string()],
//!         )?
//!         .save("tokenizer.json", pretty)?;
//!
//!     Ok(())
//! }
//! ```
//!
//! # Additional information
//!
//! - tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism is determined
//!   by the total number of cores/threads your CPU provides, but this can be tuned by setting the `RAYON_NUM_THREADS`
//!   environment variable (rayon also accepts the deprecated `RAYON_RS_NUM_THREADS` spelling).
//!   As an example, setting `RAYON_NUM_THREADS=4` will allocate a maximum of 4 threads.
//!   **_Please note this behavior may evolve in the future_**
//!
//! # Features
//!
//! - **progressbar**: The progress bar visualization is enabled by default. It can be disabled when
//!   compiling for a target that is not supported by the [termios](https://crates.io/crates/termios)
//!   dependency of the [indicatif](https://crates.io/crates/indicatif) progress bar.

// External crates whose macros are imported crate-wide through `#[macro_use]`.
#[macro_use]
extern crate log;
#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate derive_builder;

// `utils` carries `#[macro_use]` and is declared before the other modules.
// `macro_rules!` macros are textually scoped, so this ordering matters if the
// modules below use macros defined in `utils` (presumably the case; verify
// before reordering).
#[macro_use]
pub mod utils;
pub mod decoders;
pub mod models;
pub mod normalizers;
pub mod pre_tokenizers;
pub mod processors;
pub mod tokenizer;

// Re-export every public item of `tokenizer` at the crate root.
pub use tokenizer::*;

// Also re-export the parallelism utils.
pub use utils::parallelism;

// Re-export the parameters type for `from_pretrained`; only compiled when the
// `http` feature is enabled.
#[cfg(feature = "http")]
pub use utils::from_pretrained::FromPretrainedParameters;