| | |
| | |
| | |
| | |
| |
|
| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import re |
| | import unicodedata |
| |
|
| | import regex |
| |
|
| | |
# Letters that NFKD normalisation does not decompose into a base letter plus a
# combining mark, mapped by hand to a plain-ASCII replacement. Each replacement
# keeps the case of the letter it stands for.
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "TH",  # upper-case thorn: upper-case replacement, like every other pair
    "ł": "l",
    "Ł": "L",
}
| |
|
| |
|
def remove_symbols_and_diacritics(s: str, keep="") -> str:
    """
    Replace markers, symbols, and punctuation with a space and drop diacritics.

    The input is first decomposed with NFKD, so an accented letter becomes its
    base letter followed by combining marks. Each resulting character is then
    handled by the first rule that applies:

    1. characters contained in ``keep`` are emitted unchanged;
    2. characters listed in ``ADDITIONAL_DIACRITICS`` are replaced by their
       manual mapping;
    3. non-spacing marks (Unicode category ``Mn``) are dropped;
    4. any other mark, symbol, or punctuation character (category starting
       with ``M``, ``S`` or ``P``) becomes a single space;
    5. everything else is emitted unchanged.

    Args:
        s: the text to clean.
        keep: characters to pass through untouched. Membership is tested on
            the NFKD-decomposed characters, so a precomposed accented letter
            in ``keep`` will not match.

    Returns:
        The cleaned text. Runs of spaces are not collapsed here.
    """
    pieces = []
    for c in unicodedata.normalize("NFKD", s):
        if c in keep:
            pieces.append(c)
            continue
        if c in ADDITIONAL_DIACRITICS:
            pieces.append(ADDITIONAL_DIACRITICS[c])
            continue

        # Look the category up once; both remaining rules depend on it.
        category = unicodedata.category(c)
        if category == "Mn":
            continue  # combining diacritic: drop it, keeping the base letter
        pieces.append(" " if category[0] in "MSP" else c)
    return "".join(pieces)
| |
|
| |
|
def remove_symbols(s: str) -> str:
    """
    Replace markers, symbols, and punctuation with a space, keeping diacritics.

    The input is normalised with NFKC first, which composes a base letter and
    its combining marks into one precomposed character where one exists, so
    accented letters survive. Every character whose Unicode category starts
    with ``M`` (mark), ``S`` (symbol) or ``P`` (punctuation) is then replaced
    by a single space; all other characters are kept.

    Args:
        s: the text to clean.

    Returns:
        The cleaned text. Runs of spaces are not collapsed here.
    """
    return "".join(
        " " if unicodedata.category(c)[0] in "MSP" else c
        for c in unicodedata.normalize("NFKC", s)
    )
| |
|
| |
|
class BasicTextNormalizer:
    """
    Callable that lower-cases text and strips annotations and punctuation.

    Calling an instance on a string lower-cases it, removes bracketed and
    parenthesised spans, replaces symbols and punctuation with spaces,
    optionally separates every grapheme with a space, and collapses each
    whitespace run to a single space.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        """
        Args:
            remove_diacritics: when true, clean with
                ``remove_symbols_and_diacritics``; otherwise clean with
                ``remove_symbols``.
            split_letters: when true, put a space between every grapheme
                cluster of the cleaned text.
        """
        self.clean = (
            remove_symbols_and_diacritics if remove_diacritics else remove_symbols
        )
        self.split_letters = split_letters

    def __call__(self, s: str) -> str:
        """
        Normalise ``s`` and return the result.

        Leading and trailing whitespace is collapsed to one space, not
        stripped.
        """
        s = s.lower()
        # Drop spans opened by "<" or "[" and closed by ">" or "]".
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)
        # Drop non-empty parenthesised spans; a bare "()" is left for clean().
        s = re.sub(r"\(([^)]+?)\)", "", s)
        # Lower-case again after cleaning -- presumably because the Unicode
        # normalisation inside clean() can yield upper-case characters.
        s = self.clean(s).lower()

        if self.split_letters:
            # \X matches one extended grapheme cluster (third-party `regex`).
            s = " ".join(regex.findall(r"\X", s, regex.U))

        # Collapse every run of whitespace into a single space.
        s = re.sub(r"\s+", " ", s)

        return s
| |
|