//! The core of `tokenizers`, written in Rust.
//! Provides an implementation of today's most used tokenizers, with a focus on performance and
//! versatility.
//!
//! # What is a Tokenizer
//!
//! A Tokenizer works as a pipeline: it processes some raw text as input and outputs an `Encoding`.
//! The various steps of the pipeline are:
//!
//! 1. The `Normalizer`: in charge of normalizing the text. Common examples of normalization are
//!    the [unicode normalization standards](https://unicode.org/reports/tr15/#Norm_Forms), such as `NFD` or `NFKC`.
//!    More details about how to use the `Normalizers` are available in the
//!    [Hugging Face documentation](https://huggingface.co/docs/tokenizers/components#normalizers).
//! 2. The `PreTokenizer`: in charge of creating initial word splits in the text. The most common way of
//!    splitting text is simply on whitespace, as sketched after this list.
//! 3. The `Model`: in charge of doing the actual tokenization. An example of a `Model` would be
//!    `BPE` or `WordPiece`.
//! 4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant
//!    that, for example, a language model would need, such as special tokens.
//!
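//! The first two steps can also be driven standalone through their traits. Here is a minimal
//! sketch; the `normalize` and `pre_tokenize` trait methods and the `From<&str>` conversions
//! used below are assumptions based on the `Normalizer` and `PreTokenizer` traits:
//!
//! ```no_run
//! use tokenizers::normalizers::unicode::NFKC;
//! use tokenizers::pre_tokenizers::whitespace::Whitespace;
//! use tokenizers::tokenizer::{NormalizedString, Normalizer, PreTokenizedString, PreTokenizer, Result};
//!
//! fn main() -> Result<()> {
//!     // 1. Normalization: apply NFKC unicode normalization in place.
//!     let mut normalized = NormalizedString::from("Héllò hôw are ü?");
//!     NFKC.normalize(&mut normalized)?;
//!
//!     // 2. Pre-tokenization: split the normalized text on whitespace.
//!     let mut pre_tokenized = PreTokenizedString::from(normalized.get());
//!     Whitespace {}.pre_tokenize(&mut pre_tokenized)?;
//!
//!     Ok(())
//! }
//! ```
//!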
//! ## Loading a pretrained tokenizer from the Hub
//!
//! ```
//! use tokenizers::tokenizer::{Result, Tokenizer};
//!
//! fn main() -> Result<()> {
//! # #[cfg(feature = "http")]
//! # {
//!     let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None)?;
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_tokens());
//! # }
//!     Ok(())
//! }
//! ```
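//!
//! The same `Tokenizer` can also encode a pair of sequences (for example, a question and its
//! context) into a single `Encoding`. A minimal sketch; the tuple conversion into a dual
//! `EncodeInput` is an assumption:
//!
//! ```no_run
//! use tokenizers::tokenizer::{Result, Tokenizer};
//!
//! fn main() -> Result<()> {
//! # #[cfg(feature = "http")]
//! # {
//!     let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None)?;
//!
//!     // A tuple of two sequences is encoded as a pair, with matching type ids.
//!     let encoding = tokenizer.encode(("Hey there!", "How are you?"), true)?;
//!     println!("{:?}", encoding.get_type_ids());
//! # }
//!     Ok(())
//! }
//! ```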
//!
//! ## Deserialization and tokenization example
//!
//! ```no_run
//! use tokenizers::tokenizer::{Result, Tokenizer, EncodeInput};
//! use tokenizers::models::bpe::BPE;
//!
//! fn main() -> Result<()> {
//!     let bpe_builder = BPE::from_file("./path/to/vocab.json", "./path/to/merges.txt");
//!     let bpe = bpe_builder
//!         .dropout(0.1)
//!         .unk_token("[UNK]".into())
//!         .build()?;
//!
//!     let mut tokenizer = Tokenizer::new(bpe);
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_tokens());
//!
//!     Ok(())
//! }
//! ```
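//!
//! `encode` accepts anything that converts into an `EncodeInput` (hence the import above), and
//! whole batches can be processed at once. A minimal sketch reusing the same BPE setup; the
//! exact `encode_batch` signature is an assumption:
//!
//! ```no_run
//! use tokenizers::tokenizer::{Result, Tokenizer};
//! use tokenizers::models::bpe::BPE;
//!
//! fn main() -> Result<()> {
//!     let bpe = BPE::from_file("./path/to/vocab.json", "./path/to/merges.txt").build()?;
//!     let tokenizer = Tokenizer::new(bpe);
//!
//!     // Each input yields its own `Encoding` in the result.
//!     let encodings = tokenizer.encode_batch(vec!["Hey there!", "How are you?"], false)?;
//!     println!("{:?}", encodings[0].get_tokens());
//!     Ok(())
//! }
//! ```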
//!
//! ## Training and serialization example
//!
//! ```no_run
//! use tokenizers::decoders::DecoderWrapper;
//! use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
//! use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence, NormalizerWrapper};
//! use tokenizers::pre_tokenizers::byte_level::ByteLevel;
//! use tokenizers::pre_tokenizers::PreTokenizerWrapper;
//! use tokenizers::processors::PostProcessorWrapper;
//! use tokenizers::{AddedToken, Model, Result, TokenizerBuilder};
//!
//! use std::path::Path;
//!
//! fn main() -> Result<()> {
//!     let vocab_size: usize = 100;
//!
//!     let mut trainer = BpeTrainerBuilder::new()
//!         .show_progress(true)
//!         .vocab_size(vocab_size)
//!         .min_frequency(0)
//!         .special_tokens(vec![
//!             AddedToken::from(String::from("<s>"), true),
//!             AddedToken::from(String::from("<pad>"), true),
//!             AddedToken::from(String::from("</s>"), true),
//!             AddedToken::from(String::from("<unk>"), true),
//!             AddedToken::from(String::from("<mask>"), true),
//!         ])
//!         .build();
//!
//!     let mut tokenizer = TokenizerBuilder::new()
//!         .with_model(BPE::default())
//!         .with_normalizer(Some(Sequence::new(vec![
//!             Strip::new(true, true).into(),
//!             NFC.into(),
//!         ])))
//!         .with_pre_tokenizer(Some(ByteLevel::default()))
//!         .with_post_processor(Some(ByteLevel::default()))
//!         .with_decoder(Some(ByteLevel::default()))
//!         .build()?;
//!
//!     let pretty = false;
//!     tokenizer
//!         .train_from_files(
//!             &mut trainer,
//!             vec!["path/to/vocab.txt".to_string()],
//!         )?
//!         .save("tokenizer.json", pretty)?;
//!
//!     Ok(())
//! }
//! ```
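//!
//! Once saved, the `tokenizer.json` file contains the whole pipeline, so it can be reloaded and
//! used for an encode/decode round trip without rebuilding any component. A minimal sketch;
//! `decode` taking a slice of ids is an assumption, as its exact signature has varied across
//! versions:
//!
//! ```no_run
//! use tokenizers::tokenizer::{Result, Tokenizer};
//!
//! fn main() -> Result<()> {
//!     let tokenizer = Tokenizer::from_file("tokenizer.json")?;
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     // The `ByteLevel` decoder configured above restores the original surface form.
//!     let decoded = tokenizer.decode(encoding.get_ids(), true)?;
//!     println!("{}", decoded);
//!     Ok(())
//! }
//! ```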
//!
//! # Additional information
//!
//! - `tokenizers` is designed to leverage CPU parallelism when possible. The level of parallelism is determined
//!   by the total number of cores/threads your CPU provides, but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
//!   environment variable. For example, setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
//!   **_Please note this behavior may evolve in the future_**
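//!
//! The thread cap can also be set programmatically. A minimal sketch, assuming the variable is
//! read when the internal thread pool is initialized, so it must be set before any encoding call:
//!
//! ```no_run
//! // Equivalent to launching the process with RAYON_RS_NUM_THREADS=4.
//! std::env::set_var("RAYON_RS_NUM_THREADS", "4");
//! ```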
//!
//! # Features
//!
//! **progressbar**: The progress bar visualization is enabled by default. It might be disabled if
//! compilation for certain targets is not supported by the [termios](https://crates.io/crates/termios)
//! dependency of the [indicatif](https://crates.io/crates/indicatif) progress bar.
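//!
//! Since `progressbar` is part of the default feature set, one way to opt out is to disable
//! default features in `Cargo.toml`. A sketch assuming a standard Cargo setup; the version is a
//! placeholder, and this also turns off the crate's other default features, so re-enable any you
//! still need:
//!
//! ```toml
//! [dependencies]
//! tokenizers = { version = "*", default-features = false }
//! ```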
extern crate log;
extern crate lazy_static;
extern crate derive_builder;

pub mod utils;
pub mod decoders;
pub mod models;
pub mod normalizers;
pub mod pre_tokenizers;
pub mod processors;
pub mod tokenizer;

// Re-export from tokenizer
pub use tokenizer::*;
// Also re-export the parallelism utils
pub use utils::parallelism;
// Re-export for `from_pretrained`
pub use utils::from_pretrained::FromPretrainedParameters;