| use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; |
| use crate::utils::macro_rules_attribute; |
| use unicode_categories::UnicodeCategories; |
|
|
| fn is_bert_punc(x: char) -> bool { |
| char::is_ascii_punctuation(&x) || x.is_punctuation() |
| } |
|
|
| #[derive(Copy, Clone, Debug, PartialEq, Eq)] |
| #[macro_rules_attribute(impl_serde_type!)] |
| pub struct BertPreTokenizer; |
|
|
| impl PreTokenizer for BertPreTokenizer { |
| fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> { |
| pretokenized.split(|_, s| s.split(char::is_whitespace, SplitDelimiterBehavior::Removed))?; |
| pretokenized.split(|_, s| s.split(is_bert_punc, SplitDelimiterBehavior::Isolated)) |
| } |
| } |
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{NormalizedString, OffsetReferential, OffsetType};

    /// Whitespace (including a run of several spaces) is dropped, and every
    /// punctuation character becomes its own split. Offsets are byte offsets
    /// into the original input.
    #[test]
    fn basic() {
        let pretok = BertPreTokenizer;
        // Exactly FIVE spaces separate "friend!" from "How". The expected
        // offsets below jump from (10, 11) to (16, 19), so the input must
        // contain that run; with a single space the assertion cannot hold.
        let mut pretokenized: PreTokenizedString = "Hey friend!     How are you?!?".into();
        pretok.pre_tokenize(&mut pretokenized).unwrap();
        assert_eq!(
            pretokenized
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>(),
            vec![
                ("Hey", (0, 3)),
                ("friend", (4, 10)),
                ("!", (10, 11)),
                ("How", (16, 19)),
                ("are", (20, 23)),
                ("you", (24, 27)),
                ("?", (27, 28)),
                ("!", (28, 29)),
                ("?", (29, 30)),
            ]
        );
    }

    /// Characters above U+4E00 are padded with spaces through a
    /// `NormalizedString` transform before pre-tokenizing. The resulting
    /// splits must still report byte offsets relative to the original,
    /// unpadded input.
    #[test]
    fn chinese_chars() {
        let mut n = NormalizedString::from("野口里佳 Noguchi Rika");
        n.transform(
            n.get().to_owned().chars().flat_map(|c| {
                if (c as usize) > 0x4E00 {
                    // Replace `c` with " c ".
                    // NOTE(review): the integers are the per-character change
                    // values expected by `transform` (presumably 1 = inserted,
                    // 0 = kept) — confirm against `NormalizedString::transform`.
                    vec![(' ', 0), (c, 1), (' ', 1)]
                } else {
                    vec![(c, 0)]
                }
            }),
            0,
        );
        let mut pretokenized = n.into();
        let pretok = BertPreTokenizer;
        pretok.pre_tokenize(&mut pretokenized).unwrap();
        assert_eq!(
            pretokenized
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>(),
            vec![
                ("野", (0, 3)),
                ("口", (3, 6)),
                ("里", (6, 9)),
                ("佳", (9, 12)),
                ("Noguchi", (13, 20)),
                ("Rika", (21, 25))
            ]
        );
    }
}
|
|