| |
| import re |
| import sys |
|
|
| import pyopenjtalk |
|
|
| from text.symbols import punctuation |
| |
# Body of the character class shared by the two patterns below: ASCII
# letters, any digit, the iteration mark U+3005, the U+3040-U+30FF kana
# range, the U+4E00-U+9FFF ideograph range, and the listed full-width /
# half-width form ranges.
_JAPANESE_CHAR_CLASS_BODY = (
    r"A-Za-z\d"
    r"\u3005"
    r"\u3040-\u30ff"
    r"\u4e00-\u9fff"
    r"\uff11-\uff19"
    r"\uff21-\uff3a"
    r"\uff41-\uff5a"
    r"\uff66-\uff9d"
)

# Matches one character that belongs to the class above.
_japanese_characters = re.compile("[" + _JAPANESE_CHAR_CLASS_BODY + "]")

# Matches one character that does NOT belong to the class above
# (punctuation, whitespace and other marks).
_japanese_marks = re.compile("[^" + _JAPANESE_CHAR_CLASS_BODY + "]")
|
|
| |
# (compiled pattern, replacement) pairs used to spell symbols out in Japanese.
_symbols_to_japanese = [
    (re.compile(symbol), reading)
    for symbol, reading in (("%", "パーセント"),)
]
|
|
|
|
| |
# (compiled pattern, replacement) pairs that rewrite the placeholder "Q"
# depending on the consonant that follows it; any ↑/↓ marks between the two
# are kept via the captured group.
_real_sokuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"Q([↑↓]*[kg])", r"k#\1"),
        (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
        (r"Q([↑↓]*[sʃ])", r"s\1"),
        (r"Q([↑↓]*[pb])", r"p#\1"),
    )
]
|
|
| |
# (compiled pattern, replacement) pairs that rewrite the placeholder "N"
# depending on the consonant that follows it; any ↑/↓ marks between the two
# are kept via the captured group.
_real_hatsuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"N([↑↓]*[pbm])", r"m\1"),
        (r"N([↑↓]*[ʧʥj])", r"n^\1"),
        (r"N([↑↓]*[tdn])", r"n\1"),
        (r"N([↑↓]*[kg])", r"ŋ\1"),
    )
]
|
|
|
|
def post_replace_ph(ph):
    """Map a punctuation variant to its canonical symbol.

    Args:
        ph: A single phoneme or punctuation token.

    Returns:
        The canonical replacement when ``ph`` is one of the known
        punctuation variants listed below; otherwise ``ph`` unchanged.
    """
    canonical = {
        ":": ",",
        ";": ",",
        ",": ",",
        "。": ".",
        "!": "!",
        "?": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "...": "…",
    }
    # Tokens without an entry (ordinary phonemes, already-canonical marks)
    # pass through untouched.
    return canonical.get(ph, ph)
|
|
|
|
def replace_consecutive_punctuation(text):
    """Collapse every run of two or more punctuation marks to its first mark.

    The set of punctuation marks comes from ``text.symbols.punctuation``.

    Args:
        text: Input string.

    Returns:
        ``text`` with each run of consecutive punctuation characters replaced
        by the first character of that run.
    """
    # Each mark is escaped individually so it is taken literally inside the
    # character class.
    char_class = "[" + "".join(map(re.escape, punctuation)) + "]"
    return re.sub(f"({char_class})({char_class})+", r"\1", text)
|
|
|
|
def symbols_to_japanese(text):
    """Replace symbols in ``text`` with their Japanese spelling.

    Applies every (pattern, replacement) pair of ``_symbols_to_japanese`` in
    order and returns the resulting string.
    """
    for pattern, reading in _symbols_to_japanese:
        text = pattern.sub(reading, text)
    return text
|
|
|
|
def preprocess_jap(text, with_prosody=False):
    """Convert text into a flat list of phoneme and punctuation tokens.

    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    The text is split on every character matched by ``_japanese_marks``.
    Each segment that starts with a ``_japanese_characters`` match is
    converted to phonemes with pyopenjtalk, and the separating mark that
    followed it is emitted after the segment's phonemes.

    Args:
        text: Input text.
        with_prosody: When true, use ``pyopenjtalk_g2p_prosody`` and drop the
            first and last symbol of its output; otherwise use
            ``pyopenjtalk.g2p`` and split its result on spaces.

    Returns:
        list of str: phoneme (and prosody) symbols interleaved with the
        non-space separator marks found in the text.
    """
    text = symbols_to_japanese(text)
    segments = _japanese_marks.split(text)
    separators = _japanese_marks.findall(text)

    tokens = []
    for idx, segment in enumerate(segments):
        if _japanese_characters.match(segment):
            if with_prosody:
                tokens.extend(pyopenjtalk_g2p_prosody(segment)[1:-1])
            else:
                tokens.extend(pyopenjtalk.g2p(segment).split(" "))

        # The final segment has no separator after it.
        if idx >= len(separators):
            continue
        mark = separators[idx]
        # A plain space separator is dropped rather than emitted as a token.
        if mark != " ":
            tokens.append(mark.replace(" ", ""))
    return tokens
|
|
|
|
def text_normalize(text):
    """Normalize ``text`` by collapsing runs of consecutive punctuation.

    Currently this only delegates to ``replace_consecutive_punctuation``.
    """
    return replace_consecutive_punctuation(text)
|
|
| |
def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
    """Extract a phoneme + prosody symbol sequence from full-context labels.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Symbols emitted besides the phonemes themselves:

    * ``^`` for a leading ``sil`` label.
    * ``$`` or ``?`` for a trailing ``sil`` label, chosen by the numeric field
      captured with ``!(\\d+)_`` (0 -> ``$``, 1 -> ``?``; other values emit
      nothing).
    * ``_`` for a ``pau`` label.
    * ``#``, ``]`` and ``[`` after a phoneme, chosen from the A/F numeric
      fields of the current label and the A field of the next label.

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): when true, the uppercase vowel phonemes
            ``A E I O U`` (Open JTalk's notation for unvoiced vowels) are
            lowercased, i.e. treated as ordinary vowels.

    Returns:
        List[str]: List of phoneme + prosody symbols.

    Examples:
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104

    """
    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    last_index = len(labels) - 1

    # The same pattern extracts the "+<n>+" field from both the current and
    # the following label.
    a2_pattern = r"\+(\d+)\+"

    symbols = []
    for idx, label in enumerate(labels):
        # Current phoneme: the text between the first "-" and the next "+".
        phoneme = re.search(r"\-(.*?)\+", label).group(1)
        if drop_unvoiced_vowels and phoneme in "AEIOU":
            phoneme = phoneme.lower()

        if phoneme == "sil":
            # Silence is only expected as the very first or very last label.
            assert idx == 0 or idx == last_index
            if idx == 0:
                symbols.append("^")
            elif idx == last_index:
                end_flag = _numeric_feature_by_regex(r"!(\d+)_", label)
                if end_flag == 0:
                    symbols.append("$")
                elif end_flag == 1:
                    symbols.append("?")
            continue

        if phoneme == "pau":
            symbols.append("_")
            continue

        symbols.append(phoneme)

        # Numeric fields of the "/A:" section of the current label.
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", label)
        a2 = _numeric_feature_by_regex(a2_pattern, label)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", label)

        # First numeric field of the "/F:" section of the current label.
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", label)

        # Same "+<n>+" field, read from the following label.
        next_a2 = _numeric_feature_by_regex(a2_pattern, labels[idx + 1])

        if a3 == 1 and next_a2 == 1 and phoneme in "aeiouAEIOUNcl":
            symbols.append("#")
        elif a1 == 0 and next_a2 == a2 + 1 and a2 != f1:
            symbols.append("]")
        elif a2 == 1 and next_a2 == 2:
            symbols.append("[")

    return symbols
|
|
| |
def _numeric_feature_by_regex(regex, s):
    """Return the first capture group of ``regex`` in ``s`` as an int.

    Args:
        regex: Pattern with at least one capture group holding an integer.
        s: String to search.

    Returns:
        int: The captured value, or the sentinel ``-50`` when ``regex`` does
        not match anywhere in ``s``.
    """
    found = re.search(regex, s)
    return -50 if found is None else int(found.group(1))
|
|
def g2p(norm_text, with_prosody=True):
    """Convert normalized text to a list of phoneme/punctuation symbols.

    Args:
        norm_text: Text to convert.
        with_prosody: Forwarded to ``preprocess_jap``.

    Returns:
        list: The tokens produced by ``preprocess_jap``, each passed through
        ``post_replace_ph``.
    """
    return [
        post_replace_ph(token)
        for token in preprocess_jap(norm_text, with_prosody)
    ]
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: print the symbol sequence for a mixed
    # Japanese/ASCII sentence.
    print(g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!"))
|
|