| | |
| | import itertools |
| | import re |
| | from typing import Dict |
| | from typing import List |
| |
|
| | import regex |
| | from gruut import sentences |
| | from gruut.const import Sentence |
| | from gruut.const import Word |
| | from AR.text_processing.symbols import SYMBOL_TO_ID |
| |
|
| |
|
| | class GruutPhonemizer: |
| | def __init__(self, language: str): |
| | self._phonemizer = sentences |
| | self.lang = language |
| | self.symbol_to_id = SYMBOL_TO_ID |
| | self._special_cases_dict: Dict[str] = { |
| | r"\.\.\.": "... ", |
| | ";": "; ", |
| | ":": ": ", |
| | ",": ", ", |
| | r"\.": ". ", |
| | "!": "! ", |
| | r"\?": "? ", |
| | "—": "—", |
| | "…": "… ", |
| | "«": "«", |
| | "»": "»", |
| | } |
| | self._punctuation_regexp: str = ( |
| | rf"([{''.join(self._special_cases_dict.keys())}])" |
| | ) |
| |
|
| | def _normalize_punctuation(self, text: str) -> str: |
| | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) |
| | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) |
| | text = regex.sub(r"\pZ+", r" ", text) |
| | return text.strip() |
| |
|
| | def _convert_punctuation(self, word: Word) -> str: |
| | if not word.phonemes: |
| | return "" |
| | if word.phonemes[0] in ["‖", "|"]: |
| | return word.text.strip() |
| |
|
| | phonemes = "".join(word.phonemes) |
| | |
| | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) |
| | return phonemes.strip() |
| |
|
| | def phonemize(self, text: str, espeak: bool = False) -> str: |
| | text_to_phonemize: str = self._normalize_punctuation(text) |
| | sents: List[Sentence] = [ |
| | sent |
| | for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) |
| | ] |
| | words: List[str] = [ |
| | self._convert_punctuation(word) for word in itertools.chain(*sents) |
| | ] |
| | return " ".join(words) |
| |
|
| | def transform(self, phonemes): |
| | |
| | |
| | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] |
| |
|
| |
|
| | if __name__ == "__main__": |
| | phonemizer = GruutPhonemizer("en-us") |
| | |
| | phonemes = phonemizer.phonemize("Hello, wor-ld ?") |
| | print("phonemes:", phonemes) |
| | print("len(phonemes):", len(phonemes)) |
| | phoneme_ids = phonemizer.transform(phonemes) |
| | print("phoneme_ids:", phoneme_ids) |
| | print("len(phoneme_ids):", len(phoneme_ids)) |
| |
|