| """Set of default text cleaners""" |
| |
|
|
| import re |
|
|
| |
# Matches any run of one or more whitespace characters.
_whitespace_re = re.compile(r"\s+")


# Punctuation normalisation table consumed by ``replace_punctuation``: every
# occurrence of a key is replaced by its value.  Each key appears exactly once
# (repeated keys in a dict literal are silently collapsed by Python and are
# reported by linters).
rep_map = {
    # Clause / sentence separators.
    ":": ",",
    ";": ",",
    ",": ",",  # maps to itself (no-op), kept so the table contents are unchanged
    "。": ".",
    "!": "!",  # maps to itself (no-op)
    "?": "?",  # maps to itself (no-op)
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": ".",
    "…": ".",
    "$": ".",
    # Quotes and brackets are all normalised to a plain apostrophe.
    "“": "'",
    "”": "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "「": "'",
    "」": "'",
    # Dashes and tildes.
    "—": "",  # em dash is dropped entirely
    "~": "-",
}
|
|
def replace_punctuation(text):
    """Normalise punctuation in ``text`` using the module-level ``rep_map``.

    Args:
        text: Input text.

    Returns:
        A copy of ``text`` in which every occurrence of a ``rep_map`` key has
        been replaced by the value it maps to.
    """
    # One alternation over all (escaped) keys, so the text is scanned once.
    alternation = "|".join(map(re.escape, rep_map))
    return re.sub(alternation, lambda match: rep_map[match.group()], text)
|
|
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
|
|
|
|
def collapse_whitespace(text):
    """Squeeze every whitespace run in ``text`` to one space and trim the ends.

    Args:
        text: Input text.

    Returns:
        ``text`` with each match of ``_whitespace_re`` replaced by a single
        space and leading/trailing whitespace stripped.
    """
    squeezed = _whitespace_re.sub(" ", text)
    return squeezed.strip()
|
|
def remove_punctuation_at_begin(text):
    """Strip a leading run of ``,``, ``.``, ``!`` and ``?`` from ``text``.

    Only punctuation at the very start of the string is removed; anything
    after the first other character (including whitespace) is left untouched.
    """
    leading_punctuation = re.compile(r'^[,.!?]+')
    return leading_punctuation.sub('', text)
|
|
def remove_aux_symbols(text):
    """Delete auxiliary symbols from ``text``.

    The removed characters are angle brackets, parentheses, square brackets,
    double quotes, guillemets (« ») and apostrophes.
    """
    aux_symbols = re.compile(r"[\<\>\(\)\[\]\"\«\»\']+")
    return aux_symbols.sub("", text)
|
|
|
|
def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    ``;`` and ``:`` become ``,``.  ``-`` becomes a space, except for Catalan
    where it is dropped.  ``&`` is spelled out as the language's word for
    "and", padded with spaces on both sides.  Apostrophes are dropped for
    Catalan and Spanish.  For any other language tag only the
    language-independent replacements are applied.

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca", "es".

    Returns:
        The modified text
        example:
            input args:
                text: "si l'avi cau, diguem-ho"
                lang: "ca"
            Output:
                text: "si lavi cau, diguemho"
    """
    text = text.replace(";", ",")
    # Catalan joins hyphenated words; every other language splits them.
    text = text.replace("-", "" if lang == "ca" else " ")
    text = text.replace(":", ",")

    # Spelled-out "&" per language.  Every entry is padded with spaces so that
    # "a&b" does not get glued into a single word.
    ampersand_words = {
        "en": " and ",
        "fr": " et ",
        "pt": " e ",
        "ca": " i ",
        "es": " y ",
    }
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])

    # Apostrophes are removed only for Catalan and Spanish.
    if lang in ("ca", "es"):
        text = text.replace("'", "")
    return text
|
|
def spanish_cleaners(text):
    """Basic pipeline for Spanish text. There is no need to expand abbreviation and
    numbers, phonemizer already does that.

    Steps, in order: lower-case the text, replace symbols with the Spanish
    rules (``lang="es"``), normalise punctuation, remove auxiliary symbols,
    strip punctuation at the very beginning, collapse whitespace and finally
    make sure non-empty text ends with a punctuation mark.

    Args:
        text: Input text.

    Returns:
        The cleaned text.
    """
    text = lowercase(text)
    text = replace_symbols(text, lang="es")
    text = replace_punctuation(text)
    text = remove_aux_symbols(text)
    text = remove_punctuation_at_begin(text)
    text = collapse_whitespace(text)
    # Append a full stop unless the text is empty or already ends with one of
    # . , ! ? - …
    text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text)
    return text
|
|
|
|