import re
import unicodedata
from typing import Iterable, Optional

import regex
|
|
|
|
class BelarusianTextNormalizer:
    """
    Text normalizer for Belarusian ASR transcripts.

    Based on transformers.models.whisper.english_normalizer.BasicTextNormalizer
    but with support not to remove certain characters.
    e.g. apostrophe (') - a symbol from Belarusian alphabet - was removed using BasicTextNormalizer.
    """

    def __init__(self, split_letters: bool = False):
        # When True, __call__ additionally separates every extended grapheme
        # cluster with a space (useful for character-level evaluation).
        self.split_letters = split_letters
        # Characters to keep even though their Unicode category would normally
        # classify them as punctuation; the apostrophe is part of the
        # Belarusian alphabet.
        self.allowed_symbols = ("'",)

    @staticmethod
    def clean(s: str, allowed_symbols: Optional[Iterable[str]] = None) -> str:
        """
        Replace any other markers, symbols, punctuations with a space, keeping diacritics.

        Args:
            s: input text.
            allowed_symbols: characters to preserve even if their Unicode
                category is Mark/Symbol/Punctuation. ``None`` means none.

        Returns:
            The NFKC-normalized text with non-allowed M/S/P characters
            replaced by a single space each.
        """
        # Materialize once into a frozenset: O(1) membership per character,
        # and safe even if the caller passes a one-shot generator (testing
        # membership against a raw iterable would exhaust it after the
        # first character and silently drop later allowed symbols).
        allowed = frozenset(allowed_symbols) if allowed_symbols is not None else frozenset()
        return "".join(
            " " if unicodedata.category(c)[0] in "MSP" and c not in allowed else c
            for c in unicodedata.normalize("NFKC", s)
        )

    def __call__(self, s: str) -> str:
        """Normalize one transcript string and return the result."""
        s = s.lower()
        # Drop bracketed annotations such as [noise], <unk>.
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)
        # Drop parenthesized annotations such as (music).
        s = re.sub(r"\(([^)]+?)\)", "", s)
        # Lowercase again: NFKC inside clean() can introduce uppercase
        # characters (e.g. compatibility characters that normalize to
        # plain capital letters).
        s = self.clean(s, allowed_symbols=self.allowed_symbols).lower()

        if self.split_letters:
            # \X matches extended grapheme clusters (third-party `regex` module).
            s = " ".join(regex.findall(r"\X", s, regex.U))

        # Collapse whitespace runs to single spaces. Leading/trailing spaces
        # are kept (single space), matching the upstream BasicTextNormalizer.
        s = re.sub(r"\s+", " ", s)

        return s
|
|