Spaces:
Build error
Build error
| from typing import List, Union | |
| from unittest import TestCase | |
| from unittest.mock import Mock | |
| import numpy | |
| from voicevox_engine.model import AccentPhrase, AudioQuery, Mora | |
| from voicevox_engine.synthesis_engine import SynthesisEngine | |
def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, speaker_id: numpy.ndarray):
    """Deterministic stand-in used as the side effect of ``MockCore.yukarin_s_forward``.

    The arithmetic is arbitrary; it only has to be reproducible so the tests
    can compare against fixed expected numbers.

    Args:
        length: Number of leading entries of ``phoneme_list`` to process.
        phoneme_list: Indexable sequence of numeric phoneme values.
        speaker_id: Speaker id, given as a scalar or as a size-1 array.

    Returns:
        A 1-D ``numpy.ndarray`` with ``phoneme_list[i] * 0.0625 + speaker_id``
        rounded to two decimals for every ``i`` in ``range(length)``.

    Raises:
        ValueError: If ``speaker_id`` holds more than one element, because the
            per-phoneme value can then no longer be reduced to one scalar.
    """
    # ``.item()`` pulls the single element out explicitly. Calling ``float()``
    # directly on a size-1 array with ndim > 0 is deprecated since NumPy 1.25.
    return numpy.array(
        [
            round(
                float(numpy.asarray(phoneme_list[i] * 0.0625 + speaker_id).item()),
                2,
            )
            for i in range(length)
        ]
    )
def yukarin_sa_mock(
    length: int,
    vowel_phoneme_list: numpy.ndarray,
    consonant_phoneme_list: numpy.ndarray,
    start_accent_list: numpy.ndarray,
    end_accent_list: numpy.ndarray,
    start_accent_phrase_list: numpy.ndarray,
    end_accent_phrase_list: numpy.ndarray,
    speaker_id: numpy.ndarray,
):
    """Deterministic stand-in used as the side effect of ``MockCore.yukarin_sa_forward``.

    The arithmetic is arbitrary; it only has to be reproducible so the tests
    can compare against fixed expected numbers.

    Args:
        length: Number of positions to process.
        vowel_phoneme_list: Feature array; only row 0 (``[0][i]``) is read.
        consonant_phoneme_list: Feature array; only row 0 is read.
        start_accent_list: Feature array; only row 0 is read.
        end_accent_list: Feature array; only row 0 is read.
        start_accent_phrase_list: Feature array; only row 0 is read.
        end_accent_phrase_list: Feature array; only row 0 is read.
        speaker_id: Speaker id, given as a scalar or as a size-1 array.

    Returns:
        A ``numpy.ndarray`` of shape ``(1, length)``. Entry ``[0][i]`` is the
        sum of the six features at position ``i`` times ``0.0625`` plus
        ``speaker_id``, rounded to two decimals.

    Raises:
        ValueError: If ``speaker_id`` holds more than one element, because the
            per-position value can then no longer be reduced to one scalar.
    """
    # Order matches the original left-to-right addition.
    feature_rows = (
        vowel_phoneme_list,
        consonant_phoneme_list,
        start_accent_list,
        end_accent_list,
        start_accent_phrase_list,
        end_accent_phrase_list,
    )
    values = []
    for i in range(length):
        feature_sum = sum(feature[0][i] for feature in feature_rows)
        # ``.item()`` pulls the single element out explicitly. ``float()`` on a
        # size-1 array with ndim > 0 is deprecated since NumPy 1.25.
        scalar = numpy.asarray(feature_sum * 0.0625 + speaker_id).item()
        values.append(round(float(scalar), 2))
    # Add a leading axis so the output has the same 2-D layout as the inputs.
    return numpy.array(values)[numpy.newaxis]
def decode_mock(
    length: int,
    phoneme_size: int,
    f0: numpy.ndarray,
    phoneme: numpy.ndarray,
    speaker_id: Union[numpy.ndarray, int],
):
    """Deterministic stand-in used as the side effect of ``MockCore.decode_forward``.

    The arithmetic is arbitrary; it only has to be reproducible. Each frame is
    emitted 256 times because the real decode forward returns data 256 times
    as long as ``length``.

    Args:
        length: Number of frames to process.
        phoneme_size: Divisor applied to the index of the active phoneme.
        f0: Indexed as ``f0[i][0]`` for each frame ``i``.
        phoneme: Indexed as ``phoneme[i]``. Each row must contain exactly one
            entry equal to 1 so the frame reduces to a single value.
        speaker_id: Speaker id, given as an int or as a size-1 array.

    Returns:
        A 1-D ``numpy.ndarray`` of ``length * 256`` floats, with the value of
        frame ``i`` repeated 256 times in a row.

    Raises:
        ValueError: If a frame does not reduce to exactly one value, for
            example when a ``phoneme`` row has no entry, or several entries,
            equal to 1.
    """
    frame_values = []
    for i in range(length):
        # Position of the 1 in this frame's phoneme row, scaled into [0, 1).
        active_ratio = numpy.where(phoneme[i] == 1)[0] / phoneme_size
        # ``.item()`` pulls the single element out explicitly. ``float()`` on a
        # size-1 array with ndim > 0 is deprecated since NumPy 1.25.
        frame_values.append(
            float(numpy.asarray(f0[i][0] * active_ratio + speaker_id).item())
        )
    # The per-frame value does not depend on the repeat index, so compute it
    # once and expand it afterwards.
    return numpy.repeat(numpy.array(frame_values), 256)
def koreha_arimasuka_base_expected():
    """Return the baseline accent phrases the tests expect for "これはありますか".

    A new list with new ``AccentPhrase`` and ``Mora`` objects is built on every
    call, so callers can mutate the result (for example set
    ``is_interrogative`` or append moras) without affecting other test cases.
    Both phrases have ``accent=3``, no pause mora and ``is_interrogative=False``.
    """
    # Columns: text, consonant, consonant_length, vowel, vowel_length, pitch.
    first_phrase_rows = (
        ("コ", "k", 2.44, "o", 2.88, 4.38),
        ("レ", "r", 3.06, "e", 1.88, 4.0),
        ("ワ", "w", 3.62, "a", 1.44, 4.19),
    )
    second_phrase_rows = (
        ("ア", None, None, "a", 1.44, 1.44),
        ("リ", "r", 3.06, "i", 2.31, 4.44),
        ("マ", "m", 2.62, "a", 1.44, 3.12),
        ("ス", "s", 3.19, "U", 1.38, 0.0),
        ("カ", "k", 2.44, "a", 1.44, 2.94),
    )

    phrases = []
    for rows in (first_phrase_rows, second_phrase_rows):
        moras = [
            Mora(
                text=text,
                consonant=consonant,
                consonant_length=consonant_length,
                vowel=vowel,
                vowel_length=vowel_length,
                pitch=pitch,
            )
            for text, consonant, consonant_length, vowel, vowel_length, pitch in rows
        ]
        phrases.append(
            AccentPhrase(
                moras=moras,
                accent=3,
                pause_mora=None,
                is_interrogative=False,
            )
        )
    return phrases
def create_mock_query(accent_phrases):
    """Wrap ``accent_phrases`` in an ``AudioQuery`` with fixed synthesis settings.

    Only ``accent_phrases`` varies between calls; every other field is set to
    the same constant value each time.
    """
    fixed_settings = {
        "speedScale": 1,
        "pitchScale": 0,
        "intonationScale": 1,
        "volumeScale": 1,
        "prePhonemeLength": 0.1,
        "postPhonemeLength": 0.1,
        "outputSamplingRate": 24000,
        "outputStereo": False,
        "kana": "",
    }
    return AudioQuery(accent_phrases=accent_phrases, **fixed_settings)
class MockCore:
    """Minimal core stand-in handed to ``SynthesisEngine`` by the tests below.

    The three ``*_forward`` attributes are ``Mock`` objects defined on the
    class, so their recorded calls are shared by every ``MockCore`` instance.
    Each one delegates to the matching module-level mock function.
    """

    def is_model_loaded(self, speaker_id) -> bool:
        """Report every speaker's model as loaded."""
        return True

    def supported_devices(self) -> str:
        """Return an empty placeholder string."""
        return ""

    def metas(self) -> str:
        """Return an empty placeholder string."""
        return ""

    # Inference entry points, each backed by a deterministic mock function.
    decode_forward = Mock(side_effect=decode_mock)
    yukarin_sa_forward = Mock(side_effect=yukarin_sa_mock)
    yukarin_s_forward = Mock(side_effect=yukarin_s_mock)
class TestSynthesisEngineBase(TestCase):
    """Checks at which stage ``SynthesisEngine`` applies interrogative moras.

    The engine is built on ``MockCore`` and its ``_synthesis_impl`` is replaced
    by a ``Mock``, so the tests inspect the query passed to the synthesis step
    rather than any audio output.
    """

    def setUp(self):
        """Create an engine backed by ``MockCore`` with ``_synthesis_impl`` mocked."""
        super().setUp()
        self.synthesis_engine = SynthesisEngine(
            core=MockCore(),
        )
        # Mocked so the query passed to the synthesis step can be read back.
        self.synthesis_engine._synthesis_impl = Mock()

    def create_accent_phrases_test_base(self, text: str, expected: List[AccentPhrase]):
        """Assert that ``create_accent_phrases(text, 1)`` equals ``expected``."""
        actual = self.synthesis_engine.create_accent_phrases(text, 1)
        self.assertEqual(expected, actual, f"case(text:{text})")

    def create_synthesis_test_base(
        self,
        text: str,
        expected: List[AccentPhrase],
        enable_interrogative_upspeak: bool,
    ):
        """Verify whether interrogative mora processing is applied at synthesis time.

        (https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866)

        Args:
            text: Input text turned into accent phrases with speaker id 1.
            expected: Accent phrases the query given to ``_synthesis_impl``
                must contain.
            enable_interrogative_upspeak: Flag forwarded to ``synthesis``.
        """
        accent_phrases = self.synthesis_engine.create_accent_phrases(text, 1)
        query = create_mock_query(accent_phrases=accent_phrases)
        self.synthesis_engine.synthesis(
            query, 0, enable_interrogative_upspeak=enable_interrogative_upspeak
        )
        # Inspect the query passed as the first positional argument of the
        # most recent _synthesis_impl call.
        actual = self.synthesis_engine._synthesis_impl.call_args[0][0].accent_phrases
        self.assertEqual(expected, actual, f"case(text:{text})")

    @staticmethod
    def _single_mora_expected(
        text, consonant, consonant_length, vowel, vowel_length, pitch
    ):
        """Return a fresh one-phrase, one-mora expectation with ``accent=1``."""
        return [
            AccentPhrase(
                moras=[
                    Mora(
                        text=text,
                        consonant=consonant,
                        consonant_length=consonant_length,
                        vowel=vowel,
                        vowel_length=vowel_length,
                        pitch=pitch,
                    )
                ],
                accent=1,
                pause_mora=None,
                is_interrogative=False,
            )
        ]

    @staticmethod
    def _as_question(accent_phrases, upspeak=None):
        """Mark the last phrase interrogative and optionally append an upspeak mora.

        Args:
            accent_phrases: Freshly built expectation; it is mutated in place.
            upspeak: ``(text, vowel)`` of the extra mora, or ``None`` when no
                mora is expected to be appended.

        Returns:
            The same ``accent_phrases`` list, for call chaining.
        """
        last_phrase = accent_phrases[-1]
        last_phrase.is_interrogative = True
        if upspeak is not None:
            upspeak_text, upspeak_vowel = upspeak
            # The pitch is read from the current last mora before the new one
            # is attached.
            last_phrase.moras += [
                Mora(
                    text=upspeak_text,
                    consonant=None,
                    consonant_length=None,
                    vowel=upspeak_vowel,
                    vowel_length=0.15,
                    pitch=last_phrase.moras[-1].pitch + 0.3,
                )
            ]
        return accent_phrases

    def test_create_accent_phrases(self):
        """Interrogative mora processing is not applied when accent_phrases are created.

        (https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866)
        """
        expected = koreha_arimasuka_base_expected()
        expected[-1].is_interrogative = True
        self.create_accent_phrases_test_base(text="これはありますか?", expected=expected)

    def test_synthesis_interrogative(self):
        """Check the query reaching ``_synthesis_impl`` for statements and questions.

        Each text family is run as a statement, as a question with upspeak
        enabled, and as a question with upspeak disabled. Every case runs in
        its own ``subTest`` so one failure does not hide the others.
        """
        single = self._single_mora_expected
        # Columns: baseline factory, statement text, question text, and the
        # (text, vowel) of the mora expected to be appended to a question when
        # upspeak is enabled (None when no mora is expected).
        scenarios = [
            (
                koreha_arimasuka_base_expected,
                "これはありますか",
                "これはありますか?",
                ("ア", "a"),
            ),
            (
                lambda: single("ン", None, None, "N", 1.25, 1.44),
                "ん",
                "ん?",
                ("ン", "N"),
            ),
            (
                lambda: single("ッ", None, None, "cl", 1.69, 0.0),
                "っ",
                "っ?",
                None,
            ),
            (
                lambda: single("ス", "s", 3.19, "u", 3.5, 5.94),
                "す",
                "す?",
                ("ウ", "u"),
            ),
        ]
        for make_base, statement, question, upspeak in scenarios:
            # Each case gets its own freshly built expectation because
            # _as_question mutates it.
            cases = [
                (statement, True, make_base()),
                (question, True, self._as_question(make_base(), upspeak)),
                (question, False, self._as_question(make_base())),
            ]
            for text, enable_upspeak, expected in cases:
                with self.subTest(
                    text=text, enable_interrogative_upspeak=enable_upspeak
                ):
                    self.create_synthesis_test_base(
                        text=text,
                        expected=expected,
                        enable_interrogative_upspeak=enable_upspeak,
                    )