| use crate::pre_tokenizers::PreTokenizerWrapper; |
| use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result}; |
| use crate::utils::macro_rules_attribute; |
| use serde::{Deserialize, Serialize}; |
|
|
| #[derive(Clone, Debug, PartialEq)] |
| #[macro_rules_attribute(impl_serde_type!)] |
| pub struct Sequence { |
| pretokenizers: Vec<PreTokenizerWrapper>, |
| } |
|
|
| impl Sequence { |
| pub fn new(pretokenizers: Vec<PreTokenizerWrapper>) -> Self { |
| Self { pretokenizers } |
| } |
|
|
| pub fn get_pre_tokenizers(&self) -> &[PreTokenizerWrapper] { |
| &self.pretokenizers |
| } |
|
|
| pub fn get_pre_tokenizers_mut(&mut self) -> &mut [PreTokenizerWrapper] { |
| &mut self.pretokenizers |
| } |
| } |
|
|
| impl PreTokenizer for Sequence { |
| fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> { |
| for pretokenizer in &self.pretokenizers { |
| pretokenizer.pre_tokenize(pretokenized)?; |
| } |
| Ok(()) |
| } |
| } |
|
|
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| use crate::pre_tokenizers::{punctuation::Punctuation, whitespace::WhitespaceSplit}; |
| use crate::{OffsetReferential, OffsetType}; |
|
|
| #[test] |
| fn sequence_basic() { |
| let pretokenizers = vec![ |
| PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit), |
| PreTokenizerWrapper::Punctuation(Punctuation::default()), |
| ]; |
| let pretok = Sequence::new(pretokenizers); |
| let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into(); |
| pretok.pre_tokenize(&mut pretokenized).unwrap(); |
| assert_eq!( |
| pretokenized |
| .get_splits(OffsetReferential::Original, OffsetType::Byte) |
| .into_iter() |
| .map(|(s, o, _)| (s, o)) |
| .collect::<Vec<_>>(), |
| vec![ |
| ("Hey", (0, 3)), |
| ("friend", (4, 10)), |
| ("!", (10, 11)), |
| ("How", (16, 19)), |
| ("are", (20, 23)), |
| ("you", (24, 27)), |
| ("?", (27, 28)), |
| ("!", (28, 29)), |
| ("?", (29, 30)), |
| ] |
| ); |
| } |
| } |
|
|