Spaces:
Build error
Build error
| from typing import List, Optional | |
| from .model import AccentPhrase, Mora, ParseKanaError, ParseKanaErrorCode | |
| from .mora_list import openjtalk_text2mora | |
| LOOP_LIMIT = 300 | |
| UNVOICE_SYMBOL = "_" | |
| ACCENT_SYMBOL = "'" | |
| NOPAUSE_DELIMITER = "/" | |
| PAUSE_DELIMITER = "、" | |
| WIDE_INTERROGATION_MARK = "?" | |
| text2mora_with_unvoice = {} | |
| for text, (consonant, vowel) in openjtalk_text2mora.items(): | |
| text2mora_with_unvoice[text] = Mora( | |
| text=text, | |
| consonant=consonant if len(consonant) > 0 else None, | |
| consonant_length=0 if len(consonant) > 0 else None, | |
| vowel=vowel, | |
| vowel_length=0, | |
| pitch=0, | |
| is_interrogative=False, | |
| ) | |
| if vowel in ["a", "i", "u", "e", "o"]: | |
| text2mora_with_unvoice[UNVOICE_SYMBOL + text] = Mora( | |
| text=text, | |
| consonant=consonant if len(consonant) > 0 else None, | |
| consonant_length=0 if len(consonant) > 0 else None, | |
| vowel=vowel.upper(), | |
| vowel_length=0, | |
| pitch=0, | |
| is_interrogative=False, | |
| ) | |
| def _text_to_accent_phrase(phrase: str) -> AccentPhrase: | |
| """ | |
| longest matchにより読み仮名からAccentPhraseを生成 | |
| 入力長Nに対し計算量O(N^2) | |
| """ | |
| accent_index: Optional[int] = None | |
| moras: List[Mora] = [] | |
| base_index = 0 # パース開始位置。ここから右の文字列をstackに詰めていく。 | |
| stack = "" # 保留中の文字列 | |
| matched_text: Optional[str] = None # 保留中の文字列内で最後にマッチした仮名 | |
| outer_loop = 0 | |
| while base_index < len(phrase): | |
| outer_loop += 1 | |
| if phrase[base_index] == ACCENT_SYMBOL: | |
| if len(moras) == 0: | |
| raise ParseKanaError(ParseKanaErrorCode.ACCENT_TOP, text=phrase) | |
| if accent_index is not None: | |
| raise ParseKanaError(ParseKanaErrorCode.ACCENT_TWICE, text=phrase) | |
| accent_index = len(moras) | |
| base_index += 1 | |
| continue | |
| for watch_index in range(base_index, len(phrase)): | |
| if phrase[watch_index] == ACCENT_SYMBOL: | |
| break | |
| # 普通の文字の場合 | |
| stack += phrase[watch_index] | |
| if stack in text2mora_with_unvoice: | |
| matched_text = stack | |
| # push mora | |
| if matched_text is None: | |
| raise ParseKanaError(ParseKanaErrorCode.UNKNOWN_TEXT, text=stack) | |
| else: | |
| moras.append(text2mora_with_unvoice[matched_text].copy(deep=True)) | |
| base_index += len(matched_text) | |
| stack = "" | |
| matched_text = None | |
| if outer_loop > LOOP_LIMIT: | |
| raise ParseKanaError(ParseKanaErrorCode.INFINITE_LOOP) | |
| if accent_index is None: | |
| raise ParseKanaError(ParseKanaErrorCode.ACCENT_NOTFOUND, text=phrase) | |
| else: | |
| return AccentPhrase(moras=moras, accent=accent_index, pause_mora=None) | |
| def parse_kana(text: str) -> List[AccentPhrase]: | |
| """ | |
| AquesTalkライクな読み仮名をパースして音長・音高未指定のaccent phraseに変換 | |
| """ | |
| parsed_results: List[AccentPhrase] = [] | |
| phrase_base = 0 | |
| if len(text) == 0: | |
| raise ParseKanaError(ParseKanaErrorCode.EMPTY_PHRASE, position=1) | |
| for i in range(len(text) + 1): | |
| if i == len(text) or text[i] in [PAUSE_DELIMITER, NOPAUSE_DELIMITER]: | |
| phrase = text[phrase_base:i] | |
| if len(phrase) == 0: | |
| raise ParseKanaError( | |
| ParseKanaErrorCode.EMPTY_PHRASE, | |
| position=str(len(parsed_results) + 1), | |
| ) | |
| phrase_base = i + 1 | |
| is_interrogative = WIDE_INTERROGATION_MARK in phrase | |
| if is_interrogative: | |
| if WIDE_INTERROGATION_MARK in phrase[:-1]: | |
| raise ParseKanaError( | |
| ParseKanaErrorCode.INTERROGATION_MARK_NOT_AT_END, text=phrase | |
| ) | |
| phrase = phrase.replace(WIDE_INTERROGATION_MARK, "") | |
| accent_phrase: AccentPhrase = _text_to_accent_phrase(phrase) | |
| if i < len(text) and text[i] == PAUSE_DELIMITER: | |
| accent_phrase.pause_mora = Mora( | |
| text="、", | |
| consonant=None, | |
| consonant_length=None, | |
| vowel="pau", | |
| vowel_length=0, | |
| pitch=0, | |
| ) | |
| accent_phrase.is_interrogative = is_interrogative | |
| parsed_results.append(accent_phrase) | |
| return parsed_results | |
| def create_kana(accent_phrases: List[AccentPhrase]) -> str: | |
| text = "" | |
| for i, phrase in enumerate(accent_phrases): | |
| for j, mora in enumerate(phrase.moras): | |
| if mora.vowel in ["A", "I", "U", "E", "O"]: | |
| text += UNVOICE_SYMBOL | |
| text += mora.text | |
| if j + 1 == phrase.accent: | |
| text += ACCENT_SYMBOL | |
| if phrase.is_interrogative: | |
| text += WIDE_INTERROGATION_MARK | |
| if i < len(accent_phrases) - 1: | |
| if phrase.pause_mora is None: | |
| text += NOPAUSE_DELIMITER | |
| else: | |
| text += PAUSE_DELIMITER | |
| return text | |