"""Set of default text cleaners"""
| | |
| |
|
| | import re |
| |
|
| | |
| | _whitespace_re = re.compile(r"\s+") |
| |
|
# Punctuation normalisation table: each key found in the input is replaced by
# its value. Keys may be longer than one character (see "...").
#
# NOTE(review): the previous table listed "(", ")" and "~" twice and mapped
# ",", "!" and "?" to themselves, which looks like fullwidth punctuation that
# was folded to ASCII by Unicode normalisation. The fullwidth forms are
# restored below as \uXXXX escapes so they survive such normalisation; every
# ASCII key and value that was already present is kept unchanged. Confirm
# against the upstream table.
rep_map = {
    # Clause separators -> comma.
    ":": ",",
    ";": ",",
    ",": ",",  # identity entry, kept so the existing key set is unchanged
    "\uff1a": ",",  # fullwidth colon
    "\uff1b": ",",  # fullwidth semicolon
    "\uff0c": ",",  # fullwidth comma
    "·": ",",
    "、": ",",
    # Sentence terminators.
    "。": ".",
    "!": "!",  # identity entry
    "?": "?",  # identity entry
    "\uff01": "!",  # fullwidth exclamation mark
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    "...": ".",
    "…": ".",
    "$": ".",
    # Quotation marks and brackets -> apostrophe.
    "“": "'",
    "”": "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "\uff08": "'",  # fullwidth left parenthesis
    "\uff09": "'",  # fullwidth right parenthesis
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "「": "'",
    "」": "'",
    # Dashes and tildes.
    "—": "",
    "~": "-",
    "\uff5e": "-",  # fullwidth tilde
}
| |
|
def replace_punctuation(text):
    """Normalise punctuation in ``text`` using the ``rep_map`` table.

    Every occurrence of a ``rep_map`` key is replaced by its mapped value in
    one left-to-right pass, so text produced by a replacement is not scanned
    again.

    Args:
        text: Input string.

    Returns:
        The string with every mapped punctuation sequence substituted.
    """
    # Regex alternation accepts the first branch that matches, so longer keys
    # are listed first; otherwise a one-character key such as "." (if one is
    # ever added) would shadow the multi-character "..." entry.
    keys_longest_first = sorted(rep_map, key=len, reverse=True)
    pattern = re.compile("|".join(map(re.escape, keys_longest_first)))
    return pattern.sub(lambda match: rep_map[match.group()], text)
| |
|
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
| |
|
| |
|
def collapse_whitespace(text):
    """Squeeze every whitespace run into one space and trim both ends."""
    squeezed = _whitespace_re.sub(" ", text)
    return squeezed.strip()
| |
|
def remove_punctuation_at_begin(text):
    """Drop any run of ``,``, ``.``, ``!`` or ``?`` at the very start of ``text``.

    Punctuation that follows another character (including leading
    whitespace) is left in place.
    """
    leading_punctuation = re.compile(r'^[,.!?]+')
    return leading_punctuation.sub('', text)
| |
|
def remove_aux_symbols(text):
    """Delete angle, round and square brackets, guillemets and quote marks.

    Both the double quote and the apostrophe are removed.
    """
    aux_symbols = re.compile(r"[\<\>\(\)\[\]\"\«\»\']+")
    return aux_symbols.sub("", text)
| |
|
| |
|
def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    ``;`` and ``:`` become ``,``. ``-`` becomes a space, except for Catalan
    where it is removed. ``&`` is spelled out as the language's word for
    "and", padded with spaces; for any other language tag it is left
    untouched. Apostrophes are removed for Catalan and Spanish only.

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca", "es".

    Returns:
        The modified text
        example:
            input args:
                text: "si l'avi cau, diguem-ho"
                lang: "ca"
            Output:
                text: "si lavi cau, diguemho"
    """
    # Spoken form of "&" per language. Each word is padded with spaces so
    # that "a&b" does not fuse into a single token (Spanish previously used
    # a bare "y", turning "a&b" into "ayb").
    ampersand_words = {
        "en": " and ",
        "fr": " et ",
        "pt": " e ",
        "ca": " i ",
        "es": " y ",
    }
    # Languages whose apostrophes are dropped.
    apostrophe_free = ("ca", "es")

    text = text.replace(";", ",")
    # Catalan joins hyphenated clitics ("diguem-ho" -> "diguemho"); every
    # other language splits on the hyphen.
    text = text.replace("-", "" if lang == "ca" else " ")
    text = text.replace(":", ",")

    spoken_and = ampersand_words.get(lang)
    if spoken_and is not None:
        text = text.replace("&", spoken_and)
    if lang in apostrophe_free:
        text = text.replace("'", "")
    return text
| |
|
def unicleaners(text, cased=False, lang='en'):
    """Run the default cleaning pipeline on ``text``.

    The text is lower-cased unless ``cased`` is true, then passed in order
    through punctuation normalisation, language-specific symbol replacement
    (``lang`` is forwarded to ``replace_symbols``), auxiliary-symbol removal,
    leading-punctuation removal and whitespace collapsing. Finally a ``.`` is
    appended when the result does not already end in one of ``. , ! ? - …``.

    Abbreviations and numbers are not expanded here; per the original note,
    the phonemizer already does that.
    """
    cleaned = text if cased else lowercase(text)
    pipeline = (
        replace_punctuation,
        lambda chunk: replace_symbols(chunk, lang=lang),
        remove_aux_symbols,
        remove_punctuation_at_begin,
        collapse_whitespace,
    )
    for step in pipeline:
        cleaned = step(cleaned)
    # Guarantee terminal punctuation; an empty string is left empty because
    # the pattern needs one final character to match.
    return re.sub(r'([^\.,!\?\-…])$', r'\1.', cleaned)
| |
|
| |
|