2ira's picture
offline_compression_graph_code
72c0672 verified
//! Represents a tokenization pipeline.
//!
//! A [`Tokenizer`](struct.Tokenizer.html) is composed of some of the following parts.
//! - [`Normalizer`](trait.Normalizer.html): Takes care of the text normalization (like unicode normalization).
//! - [`PreTokenizer`](trait.PreTokenizer.html): Takes care of the pre tokenization (ie. How to split tokens and pre-process
//! them.
//! - [`Model`](trait.Model.html): A model encapsulates the tokenization algorithm (like BPE, Word base, character
//! based, ...).
//! - [`PostProcessor`](trait.PostProcessor.html): Takes care of the processing after tokenization (like truncating, padding,
//! ...).
use std::{
collections::HashMap,
fs::{read_to_string, File},
io::prelude::*,
io::BufReader,
ops::{Deref, DerefMut},
path::{Path, PathBuf},
};
use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};
use crate::utils::iter::ResultShunt;
use crate::utils::parallelism::*;
use crate::utils::progress::{ProgressBar, ProgressStyle};
mod added_vocabulary;
mod encoding;
pub mod normalizer;
pub mod pattern;
pub mod pre_tokenizer;
mod serialization;
// Re-export wrappers
pub use crate::decoders::DecoderWrapper;
pub use crate::models::ModelWrapper;
pub use crate::normalizers::NormalizerWrapper;
pub use crate::pre_tokenizers::PreTokenizerWrapper;
pub use crate::processors::PostProcessorWrapper;
// And some other types
pub use crate::utils::iter::LinesWithEnding;
pub use crate::utils::padding::{pad_encodings, PaddingDirection, PaddingParams, PaddingStrategy};
pub use crate::utils::truncation::{
truncate_encodings, TruncationDirection, TruncationParams, TruncationStrategy,
};
pub use added_vocabulary::*;
pub use encoding::*;
pub use normalizer::{NormalizedString, OffsetReferential, SplitDelimiterBehavior};
pub use pre_tokenizer::*;
pub type Error = Box<dyn std::error::Error + Send + Sync>;
pub type Result<T> = std::result::Result<T, Error>;
pub type Offsets = (usize, usize);
/// Takes care of pre-processing strings.
pub trait Normalizer {
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
}
/// The `PreTokenizer` is in charge of doing the pre-segmentation step. It splits the given string
/// in multiple substrings, keeping track of the offsets of said substrings from the
/// `NormalizedString`. In some occasions, the `PreTokenizer` might need to modify the given
/// `NormalizedString` to ensure we can entirely keep track of the offsets and the mapping with
/// the original string.
pub trait PreTokenizer {
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()>;
}
/// Represents a model used during Tokenization (like BPE or Word or Unigram).
pub trait Model {
type Trainer: Trainer + Sync;
/// Tokenize the given sequence into multiple underlying `Token`. The `offsets` on the `Token`
/// are expected to be relative to the given sequence.
fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>;
/// Find the ID associated to a string token
fn token_to_id(&self, token: &str) -> Option<u32>;
/// Find the string token associated to an ID
fn id_to_token(&self, id: u32) -> Option<String>;
/// Retrieve the entire vocabulary mapping (token -> ID)
fn get_vocab(&self) -> HashMap<String, u32>;
/// Retrieve the size of the vocabulary
fn get_vocab_size(&self) -> usize;
/// Save the current `Model` in the given folder, using the given `prefix` for the various
/// files that need to be saved.
fn save(&self, folder: &Path, prefix: Option<&str>) -> Result<Vec<PathBuf>>;
/// Get an instance of a Trainer capable of training this Model
fn get_trainer(&self) -> <Self as Model>::Trainer;
}
/// A `PostProcessor` has the responsibility to post process an encoded output of the `Tokenizer`.
/// It adds any special tokens that a language model would require.
pub trait PostProcessor {
/// Returns the number of tokens that will be added during the processing step
fn added_tokens(&self, is_pair: bool) -> usize;
/// Process both encodings and returns a new merged one
fn process(
&self,
encoding: Encoding,
pair_encoding: Option<Encoding>,
add_special_tokens: bool,
) -> Result<Encoding> {
let mut encodings = if let Some(pair_encoding) = pair_encoding {
vec![encoding, pair_encoding]
} else {
vec![encoding]
};
encodings.iter_mut().enumerate().for_each(|(i, encoding)| {
encoding.set_sequence_id(i);
encoding
.get_overflowing_mut()
.iter_mut()
.for_each(|encoding| encoding.set_sequence_id(i));
encoding.set_type_ids(vec![i as u32; encoding.len()]);
});
let encodings = self.process_encodings(encodings, add_special_tokens)?;
Ok(Encoding::merge(encodings, false))
}
/// Process any amount of encodings and returns a series of encoding (might merge them)
fn process_encodings(
&self,
encodings: Vec<Encoding>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>;
}
impl dyn PostProcessor {
pub fn default_process(
encodings: Vec<Encoding>,
_add_special_tokens: bool,
) -> Result<Vec<Encoding>> {
match encodings.len() {
1 => Ok(encodings),
_ => {
let mut final_encoding = Encoding::default();
for (i, mut encoding) in encodings.into_iter().enumerate() {
encoding.set_sequence_id(i);
final_encoding.merge_with(encoding, false);
}
Ok(vec![final_encoding])
}
}
}
}
#[derive(thiserror::Error, Debug)]
pub enum ProcessorError {
#[error("encodings vector length must be either 1 or 2")]
InvalidEncodingsVecLength,
}
/// A `Decoder` changes the raw tokens into its more readable form.
pub trait Decoder {
fn decode(&self, tokens: Vec<String>) -> Result<String> {
let results = self.decode_chain(tokens)?;
Ok(results.join(""))
}
fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>>;
}
/// A `Trainer` has the responsibility to train a model. We feed it with lines/sentences
/// and then it can train the given `Model`.
pub trait Trainer {
type Model: Model + Sized;
/// Whether we should show progress during the training.
fn should_show_progress(&self) -> bool;
/// The actual training method. This will return a new trained Model as well as a list
/// of `special_tokens` to be added directly to the tokenizer along with the model.
fn train(&self, model: &mut Self::Model) -> Result<Vec<AddedToken>>;
/// Process an iterator of sequences, calling `process` for each of them in order to
/// pre-process the said sequence as relevant.
fn feed<I, S, F>(&mut self, iterator: I, process: F) -> Result<()>
where
I: Iterator<Item = S> + Send,
S: AsRef<str> + Send,
F: Fn(&str) -> Result<Vec<String>> + Sync;
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
pub id: u32,
pub value: String,
pub offsets: (usize, usize),
}
impl Token {
pub fn new(id: u32, value: String, offsets: (usize, usize)) -> Self {
Self { id, value, offsets }
}
}
use std::borrow::Cow;
#[derive(Debug, Clone)]
pub enum InputSequence<'s> {
Raw(Cow<'s, str>),
PreTokenized(Cow<'s, [&'s str]>),
PreTokenizedOwned(Cow<'s, [String]>),
PreTokenizedCow(Cow<'s, [Cow<'s, str>]>),
}
impl<'s> From<Cow<'s, str>> for InputSequence<'s> {
fn from(input: Cow<'s, str>) -> Self {
Self::Raw(input)
}
}
impl<'s> From<&'s str> for InputSequence<'s> {
fn from(input: &'s str) -> Self {
Self::Raw(Cow::Borrowed(input))
}
}
impl From<String> for InputSequence<'_> {
fn from(input: String) -> Self {
Self::Raw(Cow::Owned(input))
}
}
impl<'s> From<&'s [&'s str]> for InputSequence<'s> {
fn from(input: &'s [&'s str]) -> Self {
Self::PreTokenized(Cow::Borrowed(input))
}
}
impl<'s> From<Vec<&'s str>> for InputSequence<'s> {
fn from(input: Vec<&'s str>) -> Self {
Self::PreTokenized(Cow::Owned(input))
}
}
impl<'s> From<&'s [String]> for InputSequence<'s> {
fn from(input: &'s [String]) -> Self {
Self::PreTokenizedOwned(Cow::Borrowed(input))
}
}
impl<'s> From<Vec<String>> for InputSequence<'s> {
fn from(input: Vec<String>) -> Self {
Self::PreTokenizedOwned(Cow::Owned(input))
}
}
impl<'s> From<Vec<Cow<'s, str>>> for InputSequence<'s> {
fn from(input: Vec<Cow<'s, str>>) -> Self {
Self::PreTokenizedCow(Cow::Owned(input))
}
}
impl<'s> From<&'s [Cow<'s, str>]> for InputSequence<'s> {
fn from(input: &'s [Cow<'s, str>]) -> Self {
Self::PreTokenizedCow(Cow::Borrowed(input))
}
}
#[derive(Debug, Clone)]
pub enum EncodeInput<'s> {
Single(InputSequence<'s>),
Dual(InputSequence<'s>, InputSequence<'s>),
}
impl<'s, I: Into<InputSequence<'s>>> From<I> for EncodeInput<'s> {
fn from(input: I) -> Self {
Self::Single(input.into())
}
}
impl<'s, I1, I2> From<(I1, I2)> for EncodeInput<'s>
where
I1: Into<InputSequence<'s>>,
I2: Into<InputSequence<'s>>,
{
fn from(input: (I1, I2)) -> Self {
Self::Dual(input.0.into(), input.1.into())
}
}
#[derive(thiserror::Error, Debug)]
#[error("{0}")]
pub struct BuilderError(String);
/// Builder for Tokenizer structs.
///
/// `build()` fails if the `model` is missing.
pub struct TokenizerBuilder<M, N, PT, PP, D> {
model: Option<M>,
normalizer: Option<N>,
pre_tokenizer: Option<PT>,
post_processor: Option<PP>,
decoder: Option<D>,
added_vocabulary: AddedVocabulary,
truncation: Option<TruncationParams>,
padding: Option<PaddingParams>,
}
impl<M, N, PT, PP, D> Default for TokenizerBuilder<M, N, PT, PP, D>
where
M: Model,
N: Normalizer,
PT: PreTokenizer,
PP: PostProcessor,
D: Decoder,
{
fn default() -> Self {
Self::new()
}
}
impl<M, N, PT, PP, D> TokenizerBuilder<M, N, PT, PP, D>
where
M: Model,
N: Normalizer,
PT: PreTokenizer,
PP: PostProcessor,
D: Decoder,
{
/// Get an empty TokenizerBuilder.
pub fn new() -> Self {
Self {
model: None,
normalizer: None,
pre_tokenizer: None,
post_processor: None,
decoder: None,
added_vocabulary: AddedVocabulary::new(),
truncation: None,
padding: None,
}
}
/// Convert the TokenizerBuilder to a Tokenizer.
///
/// Conversion fails if the `model` is missing.
pub fn build(self) -> Result<TokenizerImpl<M, N, PT, PP, D>> {
let model = self
.model
.ok_or_else(|| Box::new(BuilderError("Model missing.".into())))?;
Ok(TokenizerImpl {
normalizer: self.normalizer,
pre_tokenizer: self.pre_tokenizer,
model,
post_processor: self.post_processor,
decoder: self.decoder,
added_vocabulary: self.added_vocabulary,
truncation: self.truncation,
padding: self.padding,
})
}
/// Set the model.
#[must_use]
pub fn with_model(mut self, model: M) -> Self {
self.model = Some(model);
self
}
/// Set the normalizer.
#[must_use]
pub fn with_normalizer(mut self, normalizer: Option<N>) -> Self {
self.normalizer = normalizer;
self
}
/// Set the pre-tokenizer.
#[must_use]
pub fn with_pre_tokenizer(mut self, pretokenizer: Option<PT>) -> Self {
self.pre_tokenizer = pretokenizer;
self
}
/// Set the post-processor.
#[must_use]
pub fn with_post_processor(mut self, post_processor: Option<PP>) -> Self {
self.post_processor = post_processor;
self
}
/// Set the decoder.
#[must_use]
pub fn with_decoder(mut self, decoder: Option<D>) -> Self {
self.decoder = decoder;
self
}
/// Set the added vocabulary.
pub fn with_added_vocabulary(mut self, added_vocabulary: AddedVocabulary) -> Self {
self.added_vocabulary = added_vocabulary;
self
}
/// Set the trunaction parameters.
#[must_use]
pub fn with_truncation(mut self, trunc: Option<TruncationParams>) -> Self {
self.truncation = trunc;
self
}
/// Set the padding parameters.
#[must_use]
pub fn with_padding(mut self, padding: Option<PaddingParams>) -> Self {
self.padding = padding;
self
}
}
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Tokenizer(
TokenizerImpl<
ModelWrapper,
NormalizerWrapper,
PreTokenizerWrapper,
PostProcessorWrapper,
DecoderWrapper,
>,
);
impl Tokenizer {
/// Construct a new Tokenizer based on the model.
pub fn new(model: impl Into<ModelWrapper>) -> Self {
Self(TokenizerImpl::new(model.into()))
}
/// Unwrap the TokenizerImpl.
pub fn into_inner(
self,
) -> TokenizerImpl<
ModelWrapper,
NormalizerWrapper,
PreTokenizerWrapper,
PostProcessorWrapper,
DecoderWrapper,
> {
self.0
}
pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self> {
let content = read_to_string(file)?;
let tokenizer = serde_json::from_str(&content)?;
Ok(tokenizer)
}
pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self> {
let tokenizer = serde_json::from_slice(bytes.as_ref())?;
Ok(tokenizer)
}
#[cfg(feature = "http")]
pub fn from_pretrained<S: AsRef<str>>(
identifier: S,
params: Option<crate::utils::from_pretrained::FromPretrainedParameters>,
) -> Result<Self> {
let tokenizer_file = crate::utils::from_pretrained::from_pretrained(identifier, params)?;
Tokenizer::from_file(tokenizer_file)
}
}
impl std::str::FromStr for Tokenizer {
type Err = Box<dyn std::error::Error + Send + Sync>;
fn from_str(s: &str) -> Result<Self> {
Ok(serde_json::from_str(s)?)
}
}
impl<M, N, PT, PP, D> From<TokenizerImpl<M, N, PT, PP, D>> for Tokenizer
where
M: Into<ModelWrapper>,
N: Into<NormalizerWrapper>,
PT: Into<PreTokenizerWrapper>,
PP: Into<PostProcessorWrapper>,
D: Into<DecoderWrapper>,
{
fn from(t: TokenizerImpl<M, N, PT, PP, D>) -> Self {
Self(TokenizerImpl {
model: t.model.into(),
normalizer: t.normalizer.map(Into::into),
pre_tokenizer: t.pre_tokenizer.map(Into::into),
post_processor: t.post_processor.map(Into::into),
decoder: t.decoder.map(Into::into),
added_vocabulary: t.added_vocabulary,
padding: t.padding,
truncation: t.truncation,
})
}
}
impl Deref for Tokenizer {
type Target = TokenizerImpl<
ModelWrapper,
NormalizerWrapper,
PreTokenizerWrapper,
PostProcessorWrapper,
DecoderWrapper,
>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl DerefMut for Tokenizer {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
#[derive(thiserror::Error, Debug)]
#[error("{0}")]
pub struct TruncationParamError(String);
/// A `Tokenizer` is capable of encoding/decoding any text.
#[derive(Clone, Debug)]
pub struct TokenizerImpl<M, N, PT, PP, D> {
// Tokenizer parts
normalizer: Option<N>,
pre_tokenizer: Option<PT>,
model: M,
post_processor: Option<PP>,
decoder: Option<D>,
// Added Vocabulary capabilities
added_vocabulary: AddedVocabulary,
// General processing parameters
truncation: Option<TruncationParams>,
padding: Option<PaddingParams>,
}
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
M: Model,
N: Normalizer,
PT: PreTokenizer,
PP: PostProcessor,
D: Decoder,
{
/// Instantiate a new Tokenizer, with the given Model
pub fn new(model: M) -> Self {
Self {
normalizer: None,
pre_tokenizer: None,
model,
post_processor: None,
decoder: None,
added_vocabulary: AddedVocabulary::new(),
truncation: None,
padding: None,
}
}
/// Set the normalizer
pub fn with_normalizer(&mut self, normalizer: Option<impl Into<N>>) -> &mut Self {
self.normalizer = normalizer.map(|norm| norm.into());
self
}
/// Get the normalizer
pub fn get_normalizer(&self) -> Option<&N> {
self.normalizer.as_ref()
}
/// Set the pre tokenizer
pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Option<impl Into<PT>>) -> &mut Self {
self.pre_tokenizer = pre_tokenizer.map(|tok| tok.into());
self
}
/// Get the pre tokenizer
pub fn get_pre_tokenizer(&self) -> Option<&PT> {
self.pre_tokenizer.as_ref()
}
/// Set the post processor
pub fn with_post_processor(&mut self, post_processor: Option<impl Into<PP>>) -> &mut Self {
self.post_processor = post_processor.map(|post_proc| post_proc.into());
self
}
/// Get the post processor
pub fn get_post_processor(&self) -> Option<&PP> {
self.post_processor.as_ref()
}
/// Set the decoder
pub fn with_decoder(&mut self, decoder: Option<impl Into<D>>) -> &mut Self {
self.decoder = decoder.map(|dec| dec.into());
self
}
/// Get the decoder
pub fn get_decoder(&self) -> Option<&D> {
self.decoder.as_ref()
}
/// Set the model
pub fn with_model(&mut self, model: impl Into<M>) -> &mut Self {
self.model = model.into();
self
}
/// Get the model
pub fn get_model(&self) -> &M {
&self.model
}
/// Set the added vocabulary.
pub fn with_added_vocabulary(&mut self, added_vocabulary: AddedVocabulary) -> &mut Self {
self.added_vocabulary = added_vocabulary;
self
}
/// Get the added vocabulary
pub fn get_added_vocabulary(&self) -> &AddedVocabulary {
&self.added_vocabulary
}
/// Set the truncation parameters
///
/// Fails if `stride` is too high relative to `max_length` and `post_processor.added_tokens()`
pub fn with_truncation(&mut self, trunc: Option<TruncationParams>) -> Result<&mut Self> {
if let Some(trunc_params) = &trunc {
let n_added_tokens = self.get_n_added_tokens(false);
let effective_max_length = trunc_params.max_length - n_added_tokens;
if effective_max_length < trunc_params.stride {
return Err(Box::new(TruncationParamError(format!(
"tokenizer stride set to {}, which is greater than or equal to its effective max length of {} (= {} original max length - {} added special tokens), ",
trunc_params.stride, effective_max_length, trunc_params.max_length, n_added_tokens
))));
}
}
self.truncation = trunc;
Ok(self)
}
/// Get the currently set truncation parameters
pub fn get_truncation(&self) -> Option<&TruncationParams> {
self.truncation.as_ref()
}
/// Get a mutable reference to the currently set truncation parameters
pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams> {
self.truncation.as_mut()
}
/// Set the padding parameters
pub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self {
self.padding = padding;
self
}
/// Get the currently set padding parameters
pub fn get_padding(&self) -> Option<&PaddingParams> {
self.padding.as_ref()
}
/// Get a mutable reference to the currently set padding parameters
pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams> {
self.padding.as_mut()
}
/// Get the vocabulary
pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32> {
let mut final_vocab = self.model.get_vocab();
if with_added_tokens {
let added_vocab = self.added_vocabulary.get_vocab();
if !added_vocab.is_empty() {
final_vocab.reserve(added_vocab.len());
for (token, id) in added_vocab {
final_vocab.insert(token.clone(), *id);
}
}
}
final_vocab
}
/// Get the added tokens decoder
pub fn get_added_tokens_decoder(&self) -> HashMap<u32, AddedToken> {
self.added_vocabulary.get_added_tokens_decoder().clone()
}
/// Get the size of the vocabulary
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
// TODO ArthurZ THIS IS WRONG! We need to measure the length of the `set` because
// now some tokens can be both in the added_tokens_encoder and in the vocab
if with_added_tokens {
self.get_vocab(true).len()
} else {
self.model.get_vocab_size()
}
}
/// Converts a token in the corresponding id.
pub fn token_to_id(&self, token: &str) -> Option<u32> {
self.added_vocabulary.token_to_id(token, &self.model)
}
/// Converts an id to the corresponding token.
pub fn id_to_token(&self, id: u32) -> Option<String> {
self.added_vocabulary
.simple_id_to_token(id)
.or_else(|| self.model.id_to_token(id))
}
/// set the added bocab's splitting scheme
pub fn set_encode_special_tokens(&mut self, value: bool) {
self.added_vocabulary.set_encode_special_tokens(value);
}
/// Get added token value
pub fn get_encode_special_tokens(&self) -> bool {
self.added_vocabulary.get_encode_special_tokens()
}
/// Encode a single sequence
fn encode_single_sequence(
&self,
sequence: InputSequence,
type_id: u32,
offsets_type: OffsetType,
) -> Result<Encoding> {
let encode = |is_pre_tokenized, subseq_idx, subseq| -> Result<Encoding> {
let normalized = self
.added_vocabulary
.extract_and_normalize(self.normalizer.as_ref(), subseq);
let pre_tokenized = self.do_pre_tokenize(normalized)?;
let subseq_encoding = self.do_tokenize(
pre_tokenized,
type_id,
if is_pre_tokenized {
Some(subseq_idx as u32)
} else {
None
},
offsets_type,
)?;
Ok(subseq_encoding)
};
match sequence {
InputSequence::PreTokenized(seq) => seq
.iter()
.enumerate()
.map(|(i, sequence)| encode(true, i, sequence))
.collect(),
InputSequence::PreTokenizedOwned(seq) => seq
.iter()
.enumerate()
.map(|(i, sequence)| encode(true, i, sequence))
.collect(),
InputSequence::PreTokenizedCow(seq) => seq
.iter()
.enumerate()
.map(|(i, sequence)| encode(true, i, sequence))
.collect(),
InputSequence::Raw(seq) => encode(false, 0, seq.as_ref()),
}
}
/// Encode the given input. This method accepts both single sequences, as well as pair
/// sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
/// Contrarily to `encode`, it does not compute offsets
/// ```
/// # use tokenizers::Tokenizer;
/// # use tokenizers::models::bpe::BPE;
/// # let mut tokenizer = Tokenizer::new(BPE::default());
/// #
/// // Sequences:
/// tokenizer.encode_fast("Single sequence", false);
/// tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
///
/// // Pre-tokenized sequences:
/// tokenizer.encode_fast(&["Single", "sequence"][..], false);
/// tokenizer.encode_fast((
/// &["Sequence", "A"][..],
/// &["Sequence", "B"][..]
/// ), false);
///
/// // or even both types together:
/// tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
/// ```
pub fn encode_fast<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
where
E: Into<EncodeInput<'s>>,
{
// Extract sequences from the EncodeInput
let (sequence, pair) = match input.into() {
EncodeInput::Single(s1) => (s1, None),
EncodeInput::Dual(s1, s2) => (s1, Some(s2)),
};
// Encode each sequence
let encoding = self.encode_single_sequence(sequence, 0, OffsetType::None)?;
let pair_encoding = pair
.map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::None))
.transpose()?;
// And finally post process
self.post_process(encoding, pair_encoding, add_special_tokens)
}
/// Encode the given input. This method accepts both single sequences, as well as pair
/// sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
///
/// ```
/// # use tokenizers::Tokenizer;
/// # use tokenizers::models::bpe::BPE;
/// # let mut tokenizer = Tokenizer::new(BPE::default());
/// #
/// // Sequences:
/// tokenizer.encode("Single sequence", false);
/// tokenizer.encode(("Sequence A", "Sequence B"), false);
///
/// // Pre-tokenized sequences:
/// tokenizer.encode(&["Single", "sequence"][..], false);
/// tokenizer.encode((
/// &["Sequence", "A"][..],
/// &["Sequence", "B"][..]
/// ), false);
///
/// // or even both types together:
/// tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
/// ```
pub fn encode<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
where
E: Into<EncodeInput<'s>>,
{
// Extract sequences from the EncodeInput
let (sequence, pair) = match input.into() {
EncodeInput::Single(s1) => (s1, None),
EncodeInput::Dual(s1, s2) => (s1, Some(s2)),
};
// Encode each sequence
let encoding = self.encode_single_sequence(sequence, 0, OffsetType::Byte)?;
let pair_encoding = pair
.map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::Byte))
.transpose()?;
// And finally post process
self.post_process(encoding, pair_encoding, add_special_tokens)
}
/// Encode the given input, using offsets relative to chars instead of bytes.
/// This method accepts both single sequences, as well as pair sequences. Also,
/// a sequence can be a string, or already pre-tokenized input directly:
///
/// ```
/// # use tokenizers::Tokenizer;
/// # use tokenizers::models::bpe::BPE;
/// # let mut tokenizer = Tokenizer::new(BPE::default());
/// #
/// // Sequences:
/// tokenizer.encode("Single sequence", false);
/// tokenizer.encode(("Sequence A", "Sequence B"), false);
///
/// // Pre-tokenized sequences:
/// tokenizer.encode(&["Single", "sequence"][..], false);
/// tokenizer.encode((
/// &["Sequence", "A"][..],
/// &["Sequence", "B"][..]
/// ), false);
///
/// // or even both types together:
/// tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);
/// ```
pub fn encode_char_offsets<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
where
E: Into<EncodeInput<'s>>,
{
// Extract sequences from the EncodeInput
let (sequence, pair) = match input.into() {
EncodeInput::Single(s1) => (s1, None),
EncodeInput::Dual(s1, s2) => (s1, Some(s2)),
};
// Encode each sequence
let encoding = self.encode_single_sequence(sequence, 0, OffsetType::Char)?;
let pair_encoding = pair
.map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::Char))
.transpose()?;
// And finally post process
self.post_process(encoding, pair_encoding, add_special_tokens)
}
/// Decode the given ids, back to a String
pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> {
let tokens = ids
.iter()
.filter_map(|id| {
self.added_vocabulary
.simple_id_to_token(*id)
.or_else(|| self.model.id_to_token(*id))
.filter(|token| {
!skip_special_tokens || !self.added_vocabulary.is_special_token(token)
})
})
.collect::<Vec<_>>();
if let Some(decoder) = &self.decoder {
decoder.decode(tokens)
} else {
Ok(tokens.join(" "))
}
}
}
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
M: Model,
{
/// Tokenization logic, makes the bridge between the pre-tokenization phase and the real
/// tokenization phase, and converting offsets back to the original referential.
fn do_tokenize<P: Into<PreTokenizedString>>(
&self,
pretokenized: P,
type_id: u32,
word_idx: Option<u32>,
offsets_type: OffsetType,
) -> Result<Encoding> {
let mut pretokenized: PreTokenizedString = pretokenized.into();
pretokenized.tokenize(|normalized| self.model.tokenize(normalized.get()))?;
pretokenized.into_encoding(word_idx, type_id, offsets_type)
}
}
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
N: Normalizer,
{
/// Normalization logic, go through all normalizers
fn do_normalize<V: Into<NormalizedString>>(&self, normalized: V) -> Result<NormalizedString> {
let mut normalized: NormalizedString = normalized.into();
if let Some(ref normalizer) = self.normalizer {
normalizer.normalize(&mut normalized)?;
}
Ok(normalized)
}
}
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
N: Normalizer,
M: Model,
{
/// Register the given tokens as special tokens. This is especially useful for removing
/// these special tokens while decoding
pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize {
self.added_vocabulary
.add_special_tokens(tokens, &self.model, self.normalizer.as_ref())
}
/// Add the given tokens to the added vocabulary
pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize {
self.added_vocabulary
.add_tokens(tokens, &self.model, self.normalizer.as_ref())
}
}
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
PT: PreTokenizer,
{
/// PreTokenization logic, handling the case where there is no PreTokenizer set
fn do_pre_tokenize<P: Into<PreTokenizedString>>(
&self,
pretokenized: P,
) -> Result<PreTokenizedString> {
let mut pretokenized: PreTokenizedString = pretokenized.into();
if let Some(ref pretok) = self.pre_tokenizer {
pretok.pre_tokenize(&mut pretokenized)?;
}
Ok(pretokenized)
}
}
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
PP: PostProcessor,
{
/// Post processing logic, handling the case where there is no PostProcessor set
pub fn post_process(
&self,
encoding: Encoding,
pair_encoding: Option<Encoding>,
add_special_tokens: bool,
) -> Result<Encoding> {
// 1. First we truncate if needed
let (encoding, pair_encoding) = {
if let Some(trunc) = &self.truncation {
let n_added_tokens = self.get_n_added_tokens(pair_encoding.is_some());
if add_special_tokens && n_added_tokens > 0 {
let params = TruncationParams {
max_length: trunc.max_length - n_added_tokens,
..*trunc
};
truncate_encodings(encoding, pair_encoding, &params)?
} else {
truncate_encodings(encoding, pair_encoding, trunc)?
}
} else {
(encoding, pair_encoding)
}
};
// 2. Then We post process
let final_encoding = if let Some(processor) = &self.post_processor {
processor.process(encoding, pair_encoding, add_special_tokens)?
} else {
let encodings = if let Some(pair_encoding) = pair_encoding {
vec![encoding, pair_encoding]
} else {
vec![encoding]
};
let mut encodings =
<dyn PostProcessor>::default_process(encodings, add_special_tokens)?;
if encodings.len() != 1 {
panic!("We haven't reduced the encodings like we should have");
}
encodings.pop().unwrap()
};
// 3. Then we pad if needed
let [final_encoding] = if let Some(params) = &self.padding {
let mut arr = [final_encoding];
pad_encodings(&mut arr, params)?;
arr
} else {
[final_encoding]
};
Ok(final_encoding)
}
fn get_n_added_tokens(&self, is_pair: bool) -> usize {
if let Some(processor) = &self.post_processor {
processor.added_tokens(is_pair)
} else {
0
}
}
}
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
M: Model + Send + Sync,
N: Normalizer + Send + Sync,
PT: PreTokenizer + Send + Sync,
PP: PostProcessor + Send + Sync,
D: Decoder + Send + Sync,
{
/// Encode all the sentences in parallel, using multiple threads
pub fn encode_batch<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
where
E: Into<EncodeInput<'s>> + Send,
{
let mut encodings = inputs
.into_maybe_par_iter()
.map(|input| self.encode(input, add_special_tokens))
.collect::<Result<Vec<Encoding>>>()?;
if let Some(params) = &self.padding {
// We do the padding here to make sure we handle the batch padding
pad_encodings(&mut encodings, params)?;
}
Ok(encodings)
}
/// Encode all the sentences in parallel, using multiple threads.
/// The offsets on each `Encoding` will be relative to chars instead of bytes.
pub fn encode_batch_char_offsets<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
where
E: Into<EncodeInput<'s>> + Send,
{
let mut encodings = inputs
.into_maybe_par_iter()
.map(|input| self.encode_char_offsets(input, add_special_tokens))
.collect::<Result<Vec<Encoding>>>()?;
if let Some(params) = &self.padding {
// We do the padding here to make sure we handle the batch padding
pad_encodings(&mut encodings, params)?;
}
Ok(encodings)
}
/// Encode all the sentences in parallel, using multiple threads
pub fn encode_batch_fast<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
where
E: Into<EncodeInput<'s>> + Send,
{
let mut encodings = inputs
.into_maybe_par_iter()
.map(|input| self.encode_fast(input, add_special_tokens))
.collect::<Result<Vec<Encoding>>>()?;
if let Some(params) = &self.padding {
// We do the padding here to make sure we handle the batch padding
pad_encodings(&mut encodings, params)?;
}
Ok(encodings)
}
/// Decode all sentences in parallel
pub fn decode_batch(
&self,
sentences: &[&[u32]],
skip_special_tokens: bool,
) -> Result<Vec<String>>
where
M: Send + Sync,
{
sentences
.into_maybe_par_iter()
.map(|sentence| self.decode(sentence, skip_special_tokens))
.collect()
}
/// Train our Model from files
pub fn train_from_files<T>(&mut self, trainer: &mut T, files: Vec<String>) -> Result<&mut Self>
where
T: Trainer<Model = M> + Sync,
{
let mut len = 0;
for file in files.iter() {
len += File::open(file)
.and_then(|f| f.metadata())
.map(|m| m.len())?;
}
let max_read = 1_000_000;
ResultShunt::process(
files.into_iter().flat_map(|filename| {
match File::open(filename) {
Ok(file) => {
let file = BufReader::with_capacity(max_read, file);
// We read new lines using this API instead of the Lines Iterator
// on purpose. We want to keep the `\n` and potential `\r` between each lines
// We use an iterator to be able to chain with par_bridge.
itertools::Either::Left(file.lines_with_ending())
}
Err(e) => itertools::Either::Right(std::iter::once(Err(e))),
}
}),
|sequences| -> Result<()> {
let progress = if trainer.should_show_progress() {
let progress = ProgressBar::new(len);
progress.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}] {msg:<30!} {wide_bar} {percent:>18!}%")
.expect("Invalid progress template"),
);
progress
.set_message(format!("Pre-processing files ({:.2} Mo)", len / 1_000_000));
Some(progress)
} else {
None
};
trainer.feed(
sequences.inspect(|s| {
if let Some(progress) = &progress {
progress.inc(s.len() as u64)
}
}),
|seq| {
let normalized = self.do_normalize(seq.as_ref())?;
let pre_tokenized = self.do_pre_tokenize(normalized)?;
Ok(pre_tokenized
.get_splits(OffsetReferential::Original, OffsetType::Byte)
.into_iter()
.map(|(s, _, _)| s.to_owned())
.collect())
},
)?;
if let Some(pbar) = progress {
pbar.finish();
}
let special_tokens = trainer.train(&mut self.model)?;
self.add_special_tokens(&special_tokens);
Ok(())
},
)??;
Ok(self)
}
/// Train our Model, using the given Trainer and iterator
pub fn train<T, I, S>(&mut self, trainer: &mut T, sequences: I) -> Result<&mut Self>
where
T: Trainer<Model = M> + Sync,
I: Iterator<Item = S> + Send,
S: AsRef<str> + Send,
{
let (lower, upper) = sequences.size_hint();
let len = upper.unwrap_or(lower) as u64;
let progress = if trainer.should_show_progress() {
let progress = ProgressBar::new(len);
progress.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}] {msg:<30!} {wide_bar} {pos:<9!}/{len:>9!}")
.expect("Invalid progress template"),
);
progress.set_message("Pre-processing sequences");
Some(progress)
} else {
None
};
trainer.feed(
sequences.inspect(|_s| {
if let Some(progress) = &progress {
progress.inc(1)
}
}),
|seq| {
let normalized = self.do_normalize(seq.as_ref())?;
let pre_tokenized = self.do_pre_tokenize(normalized)?;
Ok(pre_tokenized
.get_splits(OffsetReferential::Original, OffsetType::Byte)
.into_iter()
.map(|(s, _, _)| s.to_owned())
.collect())
},
)?;
if let Some(pbar) = progress {
pbar.finish();
}
let special_tokens = trainer.train(&mut self.model)?;
self.add_special_tokens(&special_tokens);
Ok(self)
}
}
impl<M, N, PT, PP, D> std::str::FromStr for TokenizerImpl<M, N, PT, PP, D>
where
M: for<'de> Deserialize<'de> + Model,
N: for<'de> Deserialize<'de> + Normalizer,
PT: for<'de> Deserialize<'de> + PreTokenizer,
PP: for<'de> Deserialize<'de> + PostProcessor,
D: for<'de> Deserialize<'de> + Decoder,
{
type Err = Error;
fn from_str(s: &str) -> Result<Self> {
Ok(serde_json::from_str(s)?)
}
}
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
M: DeserializeOwned + Model,
N: DeserializeOwned + Normalizer,
PT: DeserializeOwned + PreTokenizer,
PP: DeserializeOwned + PostProcessor,
D: DeserializeOwned + Decoder,
{
/// Instantiate a new Tokenizer from the given file
pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self> {
let content = read_to_string(file)?;
let tokenizer = serde_json::from_str(&content)?;
Ok(tokenizer)
}
}
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
M: DeserializeOwned + Model,
N: DeserializeOwned + Normalizer,
PT: DeserializeOwned + PreTokenizer,
PP: DeserializeOwned + PostProcessor,
D: DeserializeOwned + Decoder,
{
/// Instantiate a new Tokenizer from bytes
pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self> {
let tokenizer = serde_json::from_slice(bytes.as_ref())?;
Ok(tokenizer)
}
}
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
M: DeserializeOwned + Model,
N: DeserializeOwned + Normalizer,
PT: DeserializeOwned + PreTokenizer,
PP: DeserializeOwned + PostProcessor,
D: DeserializeOwned + Decoder,
{
#[deprecated(
since = "0.14.0",
note = "Users should download the file separately using https://github.com/huggingface/hf-hub instead, which splits concerns of accessing the web, and should use the new cache layout"
)]
#[cfg(feature = "http")]
/// Instantiate a new Tokenizer from a file hosted on the Hugging Face Hub.
/// It expects the `identifier` of a model that includes a `tokenizer.json` file.
pub fn from_pretrained<S: AsRef<str>>(
identifier: S,
params: Option<crate::utils::from_pretrained::FromPretrainedParameters>,
) -> Result<Self> {
let tokenizer_file = crate::utils::from_pretrained::from_pretrained(identifier, params)?;
TokenizerImpl::from_file(tokenizer_file)
}
}
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
M: Serialize,
N: Serialize,
PT: Serialize,
PP: Serialize,
D: Serialize,
{
/// Serialize the current tokenizer as a String
pub fn to_string(&self, pretty: bool) -> Result<String> {
Ok(if pretty {
serde_json::to_string_pretty(self)?
} else {
serde_json::to_string(self)?
})
}
/// Save the current tokenizer at the given path
pub fn save<P: AsRef<Path>>(&self, path: P, pretty: bool) -> Result<()> {
let serialized = self.to_string(pretty)?;
let mut file = File::create(path)?;
file.write_all(serialized.as_bytes())?;
Ok(())
}
}
#[cfg(test)]
mod test {
#[cfg(feature = "http")]
#[test]
fn test_decoding_with_added_bpe() {
use crate::{
normalizers,
pre_tokenizers::split::{Split, SplitPattern},
AddedToken, NormalizerWrapper, PreTokenizerWrapper, SplitDelimiterBehavior, Tokenizer,
};
let mut tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3-8B", None).unwrap();
tokenizer.normalizer = Some(NormalizerWrapper::from(normalizers::ByteLevel::new()));
tokenizer.pre_tokenizer = Some(PreTokenizerWrapper::Split(
Split::new(
SplitPattern::Regex(r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+".into()),
SplitDelimiterBehavior::Isolated,
false,
)
.unwrap(),
));
tokenizer.add_tokens(&[AddedToken::from("嗎", false).normalized(false)]);
let encoded = tokenizer
.encode("Hey! how is this token: 嗎", false)
.unwrap();
assert_eq!(
encoded.get_ids(),
[19182, 0, 1268, 602, 82, 62428, 82, 4037, 25, 220, 128256]
);
assert_eq!(
encoded.get_tokens(),
["Hey", "!", "Ġhow", "Ġi", "s", "Ġthi", "s", "Ġtoken", ":", "Ġ", "嗎"]
);
let decoded = tokenizer.decode(encoded.get_ids(), false);
assert_eq!(decoded.unwrap(), "Hey! how is this token: 嗎");
tokenizer.add_tokens(&[AddedToken::from("д", false).normalized(true)]);
let encoded = tokenizer
.encode("Hey! how is this token: д", false)
.unwrap();
assert_eq!(
encoded.get_ids(),
[19182, 0, 1268, 602, 82, 62428, 82, 4037, 25, 220, 128257]
);
assert_eq!(
encoded.get_tokens(),
["Hey", "!", "Ġhow", "Ġi", "s", "Ġthi", "s", "Ġtoken", ":", "Ġ", "д"]
);
let decoded = tokenizer.decode(encoded.get_ids(), false);
assert_eq!(decoded.unwrap(), "Hey! how is this token: д")
}
}