use crate::tokenizer::{NormalizedString, Normalizer, Result}; use serde::{Deserialize, Serialize}; use unicode_categories::UnicodeCategories; /// Checks whether a character is whitespace fn is_whitespace(c: char) -> bool { // These are technically control characters but we count them as whitespace match c { '\t' | '\n' | '\r' => true, _ => c.is_whitespace(), } } /// Checks whether a character is a control character fn is_control(c: char) -> bool { // These are technically control characters but we count them as whitespace match c { '\t' | '\n' | '\r' => false, // The definition of `is_control` here is quite large and contains also // Cc, Cf, Cn or Co // cf. https://unicode.org/reports/tr44/ (Table 12) _ => c.is_other(), } } /// Checks whether a character is chinese /// This defines a "chinese character" as anything in the CJK Unicode block: /// https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) /// /// Note that the CJK Unicode block is NOT all Japanese and Korean characters, /// despite its name. The modern Korean Hangul alphabet is a different block, /// as is Japanese Hiragana and Katakana. Those alphabets are used to write /// space-separated words, so they are not treated specially and handled /// like for all of the other languages. fn is_chinese_char(c: char) -> bool { matches!( c as usize, 0x4E00..=0x9FFF | 0x3400..=0x4DBF | 0x20000..=0x2A6DF | 0x2A700..=0x2B73F | 0x2B740..=0x2B81F | 0x2B920..=0x2CEAF | 0xF900..=0xFAFF | 0x2F800..=0x2FA1F ) } #[derive(Copy, Clone, Debug, Deserialize, Serialize)] #[serde(tag = "type")] #[non_exhaustive] pub struct BertNormalizer { /// Whether to do the bert basic cleaning: /// 1. Remove any control characters /// 2. Replace all sorts of whitespace by the classic one ` ` pub clean_text: bool, /// Whether to put spaces around chinese characters so they get split pub handle_chinese_chars: bool, /// Whether to strip accents pub strip_accents: Option, /// Whether to lowercase the input pub lowercase: bool, } impl Default for BertNormalizer { fn default() -> Self { Self { clean_text: true, handle_chinese_chars: true, strip_accents: None, lowercase: true, } } } impl BertNormalizer { pub fn new( clean_text: bool, handle_chinese_chars: bool, strip_accents: Option, lowercase: bool, ) -> Self { Self { clean_text, handle_chinese_chars, strip_accents, lowercase, } } fn do_clean_text(&self, normalized: &mut NormalizedString) { normalized .filter(|c| !(c as usize == 0 || c as usize == 0xfffd || is_control(c))) .map(|c| if is_whitespace(c) { ' ' } else { c }); } fn do_handle_chinese_chars(&self, normalized: &mut NormalizedString) { let mut new_chars: Vec<(char, isize)> = vec![]; normalized.for_each(|c| { if is_chinese_char(c) { new_chars.extend([(' ', 0), (c, 1), (' ', 1)]); } else { new_chars.push((c, 0)); } }); normalized.transform(new_chars, 0); } fn do_strip_accents(&self, normalized: &mut NormalizedString) { normalized.nfd().filter(|c| !c.is_mark_nonspacing()); } fn do_lowercase(&self, normalized: &mut NormalizedString) { normalized.lowercase(); } } impl Normalizer for BertNormalizer { fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { if self.clean_text { self.do_clean_text(normalized); } if self.handle_chinese_chars { self.do_handle_chinese_chars(normalized); } let strip_accents = self.strip_accents.unwrap_or(self.lowercase); if strip_accents { self.do_strip_accents(normalized); } if self.lowercase { self.do_lowercase(normalized); } Ok(()) } }