File size: 5,265 Bytes
72c0672 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | use crate::tokenizer::{NormalizedString, Normalizer, Result};
use crate::utils::macro_rules_attribute;
use serde::{Deserialize, Serialize};
use unicode_normalization_alignments::char::is_combining_mark;
#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
#[serde(tag = "type")]
#[non_exhaustive]
pub struct Strip {
pub strip_left: bool,
pub strip_right: bool,
}
impl Strip {
pub fn new(strip_left: bool, strip_right: bool) -> Self {
Self {
strip_left,
strip_right,
}
}
}
impl Normalizer for Strip {
/// Strip the normalized string inplace
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
if self.strip_left && self.strip_right {
// Fast path
normalized.strip();
} else {
if self.strip_left {
normalized.lstrip();
}
if self.strip_right {
normalized.rstrip();
}
}
Ok(())
}
}
// This normalizer removes combining marks from a normalized string
// It's different from unidecode as it does not attempt to modify
// non ascii languages.
#[derive(Copy, Clone, Debug)]
#[macro_rules_attribute(impl_serde_type!)]
pub struct StripAccents;
impl Normalizer for StripAccents {
/// Strip the normalized string inplace
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
normalized.filter(|c| !is_combining_mark(c));
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::normalizer::NormalizedString;
use crate::normalizers::Lowercase;
use crate::normalizers::NFKD;
use unicode_normalization_alignments::UnicodeNormalization;
#[test]
fn test_strip_accents() {
// Unicode combining char
let original: String = "Me llamó".nfkd().map(|(c, _)| c).collect();
let normalized = "Me llamo";
assert_ne!(original, normalized);
let mut n = NormalizedString::from(original);
StripAccents.normalize(&mut n).unwrap();
assert_eq!(&n.get(), &normalized);
// Ignores regular ascii
let original = "Me llamo";
let normalized = "Me llamo";
assert_eq!(original, normalized);
let mut n = NormalizedString::from(original);
StripAccents.normalize(&mut n).unwrap();
assert_eq!(&n.get(), &normalized);
// Does not change chinese
let original: String = "这很简单".nfkd().map(|(c, _)| c).collect();
let normalized = "这很简单";
assert_eq!(original, normalized);
let mut n = NormalizedString::from(original);
StripAccents.normalize(&mut n).unwrap();
assert_eq!(&n.get(), &normalized);
}
#[test]
fn test_vietnamese_bug() {
let original: String = "ậ…".to_string();
let normalized = "a...".to_string();
assert_ne!(original, normalized);
let mut n = NormalizedString::from(original);
NFKD.normalize(&mut n).unwrap();
StripAccents.normalize(&mut n).unwrap();
assert_eq!(&n.get(), &normalized);
Lowercase.normalize(&mut n).unwrap();
assert_eq!(&n.get(), &normalized);
let original: String = "Cụ thể, bạn sẽ tham gia một nhóm các giám đốc điều hành tổ chức, các nhà lãnh đạo doanh nghiệp, các học giả, chuyên gia phát triển và tình nguyện viên riêng biệt trong lĩnh vực phi lợi nhuận…".to_string();
let normalized = "cu the, ban se tham gia mot nhom cac giam đoc đieu hanh to chuc, cac nha lanh đao doanh nghiep, cac hoc gia, chuyen gia phat trien va tinh nguyen vien rieng biet trong linh vuc phi loi nhuan...".to_string();
let mut n = NormalizedString::from(original);
NFKD.normalize(&mut n).unwrap();
StripAccents.normalize(&mut n).unwrap();
Lowercase.normalize(&mut n).unwrap();
assert_eq!(&n.get(), &normalized);
}
#[test]
fn test_thai_bug() {
let original = "ำน\u{e49}ำ3ลำ".to_string();
let normalized = "านา3ลา".to_string();
assert_ne!(original, normalized);
let mut n = NormalizedString::from(original);
NFKD.normalize(&mut n).unwrap();
StripAccents.normalize(&mut n).unwrap();
Lowercase.normalize(&mut n).unwrap();
assert_eq!(&n.get(), &normalized);
}
#[test]
fn test_strip_accents_multiple() {
let original = "e\u{304}\u{304}\u{304}o";
let normalized = "eo";
assert_ne!(original, normalized);
let mut n = NormalizedString::from(original);
StripAccents.normalize(&mut n).unwrap();
assert_eq!(&n.get(), &normalized);
assert_eq!(
n,
NormalizedString::new(
original.to_string(),
normalized.to_string(),
vec![(0, 1), (7, 8)],
0
)
);
assert_eq!(
n.alignments_original(),
vec![
(0, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 2)
]
);
}
}
|