| import json
|
| import re
|
| import unicodedata
|
|
|
| from utils.norm_config import norm_config
|
|
|
|
|
def text_normalize(
    text,
    iso_code="xxx",
    lower_case=True,
    remove_numbers=False,
    remove_brackets=False,
    rm_extra_spaces=False,
):
    """Normalize ``text`` using the ``norm_config`` rules for ``iso_code``.

    The steps run in this order:

    1. Unicode-normalize with the configured ``unicode_norm`` form.
    2. Lower-case, when both the config and ``lower_case`` allow it.
    3. Replace every parenthesised span that contains a digit with a space
       (always), and every remaining parenthesised span when
       ``remove_brackets`` is set.
    4. Apply the configured ``mapping`` (regex pattern -> replacement).
    5. Replace characters of ``punc_set`` with a space and delete characters
       of ``del_set``.
    6. Optionally replace digits-only words with a space.
    7. Optionally strip diacritics with ``unidecode`` and collapse whitespace.

    Args:
        text : The string to be normalized
        iso_code : Key into ``norm_config``; unknown codes use the ``"*"``
            entry. Settings missing from a language entry are inherited
            from ``"*"``.
        lower_case : Allow lower-casing; the config's ``lower_case`` setting
            must also be truthy for it to happen
        remove_numbers : Boolean flag to specify if words containing only
            digits should be removed (each is replaced by a single space)
        remove_brackets : Boolean flag to also remove parenthesised spans
            that do not contain a digit
        rm_extra_spaces : Boolean flag to collapse whitespace runs into one
            space and strip leading/trailing whitespace

    Returns:
        normalized_text : the string after all normalization
    """
    defaults = norm_config["*"]
    # Layer the language settings over the defaults in a fresh dict so the
    # shared ``norm_config`` entries are never modified by a call.
    config = {**defaults, **norm_config.get(iso_code, defaults)}

    text = unicodedata.normalize(config["unicode_norm"], text)

    if config["lower_case"] and lower_case:
        text = text.lower()

    # Parenthesised spans holding at least one digit are dropped regardless of
    # ``remove_brackets``; the flag only extends this to digit-free spans.
    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
    if remove_brackets:
        text = re.sub(r"\([^\)]*\)", " ", text)

    # Mapping keys are used as regex patterns, values as replacement templates.
    for old, new in config["mapping"].items():
        text = re.sub(old, new, text)

    normalized_text = text

    # An empty set would build the invalid character class "[]", so the
    # corresponding step is skipped instead of raising re.error.
    if config["punc_set"]:
        punct_pattern = "[" + config["punc_set"] + "]"
        normalized_text = re.sub(punct_pattern, " ", normalized_text)

    if config["del_set"]:
        delete_pattern = "[" + config["del_set"] + "]"
        normalized_text = re.sub(delete_pattern, "", normalized_text)

    if remove_numbers and config["digit_set"]:
        # A digits-only word is a run of digits with no non-whitespace
        # character on either side. Lookarounds leave the neighbouring
        # whitespace unconsumed, so adjacent numbers ("1 2 3") all match, and
        # a text that consists of a single number is covered as well.
        digits_pattern = "[" + config["digit_set"] + "]+"
        complete_digit_pattern = r"(?<!\S)" + digits_pattern + r"(?!\S)"
        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)

    # "rm_diacritics" is optional: a missing key means "keep diacritics".
    if config.get("rm_diacritics", False):
        # Imported lazily so unidecode is only required when actually used.
        from unidecode import unidecode

        normalized_text = unidecode(normalized_text)

    if rm_extra_spaces:
        normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

    return normalized_text
|
|
|