use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
use crate::utils::macro_rules_attribute;
use unicode_categories::UnicodeCategories;

/// Returns `true` when `x` should be treated as punctuation by the BERT pre-tokenizer.
///
/// A character qualifies if it is ASCII punctuation or if the `UnicodeCategories`
/// trait reports it as punctuation.
fn is_bert_punc(x: char) -> bool {
    char::is_ascii_punctuation(&x) || x.is_punctuation()
}

/// Pre-tokenizer that splits on whitespace and isolates punctuation characters.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
#[macro_rules_attribute(impl_serde_type!)]
pub struct BertPreTokenizer;

impl PreTokenizer for BertPreTokenizer {
    /// Splits `pretokenized` in place, in two passes.
    ///
    /// 1. Split on whitespace, dropping the whitespace itself.
    /// 2. Split on punctuation (see `is_bert_punc`), keeping every punctuation
    ///    character as its own split.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by either split pass.
    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
        // Whitespace only separates pieces, so the delimiters are discarded.
        pretokenized.split(|_, s| s.split(char::is_whitespace, SplitDelimiterBehavior::Removed))?;
        // Punctuation is meaningful, so each mark is kept as a standalone piece.
        pretokenized.split(|_, s| s.split(is_bert_punc, SplitDelimiterBehavior::Isolated))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{NormalizedString, OffsetReferential, OffsetType};

    #[test]
    fn basic() {
        let pretok = BertPreTokenizer;
        let mut pretokenized: PreTokenizedString = "Hey friend!     How are you?!?".into();
        pretok.pre_tokenize(&mut pretokenized).unwrap();
        assert_eq!(
            pretokenized
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>(),
            vec![
                ("Hey", (0, 3)),
                ("friend", (4, 10)),
                ("!", (10, 11)),
                ("How", (16, 19)),
                ("are", (20, 23)),
                ("you", (24, 27)),
                ("?", (27, 28)),
                ("!", (28, 29)),
                ("?", (29, 30)),
            ]
        );
    }

    #[test]
    fn chinese_chars() {
        let mut n = NormalizedString::from("野口里佳 Noguchi Rika");
        // Surround every character above U+4E00 with spaces so that the
        // whitespace pass separates each of them into its own split.
        n.transform(
            n.get().to_owned().chars().flat_map(|c| {
                if (c as usize) > 0x4E00 {
                    vec![(' ', 0), (c, 1), (' ', 1)]
                } else {
                    vec![(c, 0)]
                }
            }),
            0,
        );
        let mut pretokenized = n.into();
        let pretok = BertPreTokenizer;
        pretok.pre_tokenize(&mut pretokenized).unwrap();
        assert_eq!(
            pretokenized
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>(),
            vec![
                ("野", (0, 3)),
                ("口", (3, 6)),
                ("里", (6, 9)),
                ("佳", (9, 12)),
                ("Noguchi", (13, 20)),
                ("Rika", (21, 25))
            ]
        );
    }
}