| import math |
| from copy import deepcopy |
| from random import random |
| from typing import Union |
| from unittest import TestCase |
| from unittest.mock import Mock |
|
|
| import numpy |
|
|
| from voicevox_engine.acoustic_feature_extractor import OjtPhoneme |
| from voicevox_engine.model import AccentPhrase, AudioQuery, Mora |
| from voicevox_engine.synthesis_engine import SynthesisEngine |
|
|
| |
| from voicevox_engine.synthesis_engine.synthesis_engine import ( |
| mora_phoneme_list, |
| pre_process, |
| split_mora, |
| to_flatten_moras, |
| to_phoneme_data_list, |
| unvoiced_mora_phoneme_list, |
| ) |
|
|
|
|
def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, speaker_id: numpy.ndarray):
    """Deterministic stand-in for the core's ``yukarin_s_forward``.

    Installed as the ``side_effect`` of ``MockCore.yukarin_s_forward``.

    Args:
        length: Number of leading entries of ``phoneme_list`` to process.
        phoneme_list: Indexable sequence of numeric phoneme ids.
        speaker_id: Speaker id, either a plain number or an array holding
            exactly one element.

    Returns:
        A 1-D float array with ``length`` entries, where entry ``i`` is
        ``phoneme_list[i] * 0.5 + speaker_id``.

    Raises:
        ValueError: If ``speaker_id`` holds more than one element.
        IndexError: If ``length`` exceeds the size of ``phoneme_list``.
    """
    # Pull out the single speaker id once. Calling float() on an array with
    # ndim > 0 is deprecated since NumPy 1.25, so the per-element expression
    # below must stay a scalar.
    speaker = numpy.asarray(speaker_id).item()
    return numpy.array(
        [float(phoneme_list[i] * 0.5 + speaker) for i in range(length)]
    )
|
|
|
|
def yukarin_sa_mock(
    length: int,
    vowel_phoneme_list: numpy.ndarray,
    consonant_phoneme_list: numpy.ndarray,
    start_accent_list: numpy.ndarray,
    end_accent_list: numpy.ndarray,
    start_accent_phrase_list: numpy.ndarray,
    end_accent_phrase_list: numpy.ndarray,
    speaker_id: numpy.ndarray,
):
    """Deterministic stand-in for the core's ``yukarin_sa_forward``.

    Installed as the ``side_effect`` of ``MockCore.yukarin_sa_forward``.

    Each of the six feature arguments is indexed as ``feature[0][i]``, i.e.
    only its first row is read.

    Args:
        length: Number of leading columns to process.
        vowel_phoneme_list: Feature rows; row 0 is read.
        consonant_phoneme_list: Feature rows; row 0 is read.
        start_accent_list: Feature rows; row 0 is read.
        end_accent_list: Feature rows; row 0 is read.
        start_accent_phrase_list: Feature rows; row 0 is read.
        end_accent_phrase_list: Feature rows; row 0 is read.
        speaker_id: Speaker id, either a plain number or an array holding
            exactly one element.

    Returns:
        A float array of shape ``(1, length)`` where column ``i`` is
        ``(sum of the six features at [0][i]) * 0.5 + speaker_id``.

    Raises:
        ValueError: If ``speaker_id`` holds more than one element.
    """
    # Extract the speaker id once so the per-column value is a true scalar.
    # float() on an ndim > 0 array is deprecated since NumPy 1.25.
    speaker = numpy.asarray(speaker_id).item()
    features = (
        vowel_phoneme_list,
        consonant_phoneme_list,
        start_accent_list,
        end_accent_list,
        start_accent_phrase_list,
        end_accent_phrase_list,
    )
    values = [
        float(sum(feature[0][i] for feature in features) * 0.5 + speaker)
        for i in range(length)
    ]
    # Add a leading axis so the result has the same (1, length) layout as
    # the inputs.
    return numpy.array(values)[numpy.newaxis]
|
|
|
|
def decode_mock(
    length: int,
    phoneme_size: int,
    f0: numpy.ndarray,
    phoneme: numpy.ndarray,
    speaker_id: Union[numpy.ndarray, int],
):
    """Deterministic stand-in for the core's ``decode_forward``.

    Installed as the ``side_effect`` of ``MockCore.decode_forward`` and also
    called directly by the tests to compute the expected waveform.

    For every frame ``i`` the value
    ``f0[i][0] * (hot_index / phoneme_size) + speaker_id`` is emitted 256
    times, where ``hot_index`` is the position of the single ``1`` in
    ``phoneme[i]``.

    Args:
        length: Number of leading frames to decode.
        phoneme_size: Divisor applied to the hot index.
        f0: Per-frame values, read as ``f0[i][0]``.
        phoneme: Per-frame rows; each processed row must contain exactly one
            element equal to ``1``.
        speaker_id: Speaker id, either a plain number or an array holding
            exactly one element.

    Returns:
        A 1-D float64 array with ``length * 256`` entries.

    Raises:
        ValueError: If ``speaker_id`` holds more than one element, or a
            processed phoneme row does not contain exactly one ``1``.
    """
    # Extract the speaker id once. float() on an ndim > 0 array is
    # deprecated since NumPy 1.25.
    speaker = numpy.asarray(speaker_id).item()
    frame_values = []
    for i in range(length):
        hot_index = numpy.where(numpy.asarray(phoneme[i]) == 1)[0]
        # .item() requires exactly one hot index in the row.
        frame_values.append(
            (f0[i][0] * (hot_index / phoneme_size) + speaker).item()
        )
    # The value is constant within a frame, so compute it once per frame
    # and expand to 256 samples afterwards.
    return numpy.repeat(numpy.array(frame_values, dtype=numpy.float64), 256)
|
|
|
|
class MockCore:
    """Minimal core replacement handed to ``SynthesisEngine`` in the tests.

    The three ``*_forward`` attributes are ``Mock`` objects whose side
    effects are the deterministic mock functions defined in this module, so
    tests can inspect the recorded call arguments and also recompute the
    expected outputs.
    """

    # NOTE: these mocks are class attributes, so every MockCore instance
    # shares the same Mock objects and their recorded call history.
    yukarin_s_forward = Mock(side_effect=yukarin_s_mock)
    yukarin_sa_forward = Mock(side_effect=yukarin_sa_mock)
    decode_forward = Mock(side_effect=decode_mock)

    def metas(self):
        """Return an empty string as placeholder metadata."""
        return ""

    def supported_devices(self):
        """Return an empty string as placeholder device information."""
        return ""

    def is_model_loaded(self, speaker_id):
        """Report every speaker's model as loaded, whatever ``speaker_id`` is."""
        return True
|
|
|
|
class TestSynthesisEngine(TestCase):
    """Tests for the synthesis-engine helper functions and ``SynthesisEngine``.

    The engine under test is built on ``MockCore``; its forward functions
    are plain arithmetic, which lets each test recompute the expected result
    from the arguments the mocks recorded.
    """

    @staticmethod
    def _mora(text, consonant, vowel):
        """Build a ``Mora`` with zero vowel length and pitch.

        ``consonant_length`` is ``None`` when ``consonant`` is ``None`` and
        ``0.0`` otherwise.
        """
        return Mora(
            text=text,
            consonant=consonant,
            consonant_length=None if consonant is None else 0.0,
            vowel=vowel,
            vowel_length=0.0,
            pitch=0.0,
        )

    def setUp(self):
        """Create the "konnichiwa, hihodesu" fixtures and a mock-backed engine."""
        super().setUp()
        self.str_list_hello_hiho = (
            "sil k o N n i ch i w a pau h i h o d e s U sil".split()
        )
        # Same phoneme sequence with "pau" at both ends instead of "sil";
        # each phoneme occupies the interval [i, i + 1).
        self.phoneme_data_list_hello_hiho = [
            OjtPhoneme(phoneme=p, start=i, end=i + 1)
            for i, p in enumerate(
                "pau k o N n i ch i w a pau h i h o d e s U pau".split()
            )
        ]
        mora = self._mora
        self.accent_phrases_hello_hiho = [
            AccentPhrase(
                moras=[
                    mora("コ", "k", "o"),
                    mora("ン", None, "N"),
                    mora("ニ", "n", "i"),
                    mora("チ", "ch", "i"),
                    mora("ワ", "w", "a"),
                ],
                accent=5,
                pause_mora=mora("、", None, "pau"),
            ),
            AccentPhrase(
                moras=[
                    mora("ヒ", "h", "i"),
                    mora("ホ", "h", "o"),
                    mora("デ", "d", "e"),
                    mora("ス", "s", "U"),
                ],
                accent=1,
                pause_mora=None,
            ),
        ]
        core = MockCore()
        self.yukarin_s_mock = core.yukarin_s_forward
        self.yukarin_sa_mock = core.yukarin_sa_forward
        self.decode_mock = core.decode_forward
        self.synthesis_engine = SynthesisEngine(
            core=core,
        )

    def test_to_flatten_moras(self):
        """``to_flatten_moras`` yields phrase moras with the pause mora in between."""
        flatten_moras = to_flatten_moras(self.accent_phrases_hello_hiho)
        first, second = self.accent_phrases_hello_hiho
        self.assertEqual(
            flatten_moras,
            first.moras + [first.pause_mora] + second.moras,
        )

    def test_to_phoneme_data_list(self):
        """``to_phoneme_data_list`` maps the string list to the expected phonemes."""
        phoneme_data_list = to_phoneme_data_list(self.str_list_hello_hiho)
        self.assertEqual(phoneme_data_list, self.phoneme_data_list_hello_hiho)

    def test_split_mora(self):
        """``split_mora`` separates consonants, vowels and vowel indexes."""
        consonant_phoneme_list, vowel_phoneme_list, vowel_indexes = split_mora(
            self.phoneme_data_list_hello_hiho
        )

        self.assertEqual(vowel_indexes, [0, 2, 3, 5, 7, 9, 10, 12, 14, 16, 18, 19])
        self.assertEqual(
            vowel_phoneme_list,
            [
                OjtPhoneme(phoneme="pau", start=0, end=1),
                OjtPhoneme(phoneme="o", start=2, end=3),
                OjtPhoneme(phoneme="N", start=3, end=4),
                OjtPhoneme(phoneme="i", start=5, end=6),
                OjtPhoneme(phoneme="i", start=7, end=8),
                OjtPhoneme(phoneme="a", start=9, end=10),
                OjtPhoneme(phoneme="pau", start=10, end=11),
                OjtPhoneme(phoneme="i", start=12, end=13),
                OjtPhoneme(phoneme="o", start=14, end=15),
                OjtPhoneme(phoneme="e", start=16, end=17),
                OjtPhoneme(phoneme="U", start=18, end=19),
                OjtPhoneme(phoneme="pau", start=19, end=20),
            ],
        )
        # Entries are None where the mora has no consonant.
        self.assertEqual(
            consonant_phoneme_list,
            [
                None,
                OjtPhoneme(phoneme="k", start=1, end=2),
                None,
                OjtPhoneme(phoneme="n", start=4, end=5),
                OjtPhoneme(phoneme="ch", start=6, end=7),
                OjtPhoneme(phoneme="w", start=8, end=9),
                None,
                OjtPhoneme(phoneme="h", start=11, end=12),
                OjtPhoneme(phoneme="h", start=13, end=14),
                OjtPhoneme(phoneme="d", start=15, end=16),
                OjtPhoneme(phoneme="s", start=17, end=18),
                None,
            ],
        )

    def test_pre_process(self):
        """``pre_process`` returns flattened moras and the matching phoneme list."""
        flatten_moras, phoneme_data_list = pre_process(
            deepcopy(self.accent_phrases_hello_hiho)
        )

        mora_index = 0
        # Index 0 is the leading "pau", checked separately below.
        phoneme_index = 1

        self.assertEqual(phoneme_data_list[0], OjtPhoneme("pau", 0, 1))
        for accent_phrase in self.accent_phrases_hello_hiho:
            for mora in accent_phrase.moras:
                self.assertEqual(flatten_moras[mora_index], mora)
                mora_index += 1
                if mora.consonant is not None:
                    self.assertEqual(
                        phoneme_data_list[phoneme_index],
                        OjtPhoneme(mora.consonant, phoneme_index, phoneme_index + 1),
                    )
                    phoneme_index += 1
                self.assertEqual(
                    phoneme_data_list[phoneme_index],
                    OjtPhoneme(mora.vowel, phoneme_index, phoneme_index + 1),
                )
                phoneme_index += 1
            if accent_phrase.pause_mora:
                self.assertEqual(flatten_moras[mora_index], accent_phrase.pause_mora)
                mora_index += 1
                self.assertEqual(
                    phoneme_data_list[phoneme_index],
                    OjtPhoneme("pau", phoneme_index, phoneme_index + 1),
                )
                phoneme_index += 1
        # The list must end with a trailing "pau".
        self.assertEqual(
            phoneme_data_list[phoneme_index],
            OjtPhoneme("pau", phoneme_index, phoneme_index + 1),
        )

    def test_replace_phoneme_length(self):
        """``replace_phoneme_length`` writes the mocked lengths into the moras."""
        result = self.synthesis_engine.replace_phoneme_length(
            accent_phrases=deepcopy(self.accent_phrases_hello_hiho), speaker_id=1
        )

        # Check the keyword arguments of the latest yukarin_s_forward call.
        yukarin_s_args = self.yukarin_s_mock.call_args[1]
        list_length = yukarin_s_args["length"]
        phoneme_list = yukarin_s_args["phoneme_list"]
        self.assertEqual(list_length, 20)
        self.assertEqual(list_length, len(phoneme_list))
        numpy.testing.assert_array_equal(
            phoneme_list,
            numpy.array(
                [
                    0, 23, 30, 4, 28, 21, 10, 21, 42, 7,
                    0, 19, 21, 19, 30, 12, 14, 35, 6, 0,
                ],
                dtype=numpy.int64,
            ),
        )
        self.assertEqual(yukarin_s_args["speaker_id"], 1)

        # Rebuild the expected phrases with the same arithmetic as
        # yukarin_s_mock for speaker id 1.
        true_result = deepcopy(self.accent_phrases_hello_hiho)
        # Start at 1 to skip the leading pause phoneme.
        index = 1

        def result_value(i: int):
            return float(phoneme_list[i] * 0.5 + 1)

        for accent_phrase in true_result:
            for mora in accent_phrase.moras:
                if mora.consonant is not None:
                    mora.consonant_length = result_value(index)
                    index += 1
                mora.vowel_length = result_value(index)
                index += 1
            if accent_phrase.pause_mora is not None:
                accent_phrase.pause_mora.vowel_length = result_value(index)
                index += 1

        self.assertEqual(result, true_result)

    def test_replace_mora_pitch(self):
        """``replace_mora_pitch`` writes the mocked pitches into the moras."""
        # Empty input is returned as an empty list.
        empty_accent_phrases = []
        self.assertEqual(
            self.synthesis_engine.replace_mora_pitch(
                accent_phrases=empty_accent_phrases, speaker_id=1
            ),
            [],
        )

        result = self.synthesis_engine.replace_mora_pitch(
            accent_phrases=deepcopy(self.accent_phrases_hello_hiho), speaker_id=1
        )

        # Check the keyword arguments of the latest yukarin_sa_forward call;
        # every feature is passed with a leading axis, so take row 0.
        yukarin_sa_args = self.yukarin_sa_mock.call_args[1]
        list_length = yukarin_sa_args["length"]
        vowel_phoneme_list = yukarin_sa_args["vowel_phoneme_list"][0]
        consonant_phoneme_list = yukarin_sa_args["consonant_phoneme_list"][0]
        start_accent_list = yukarin_sa_args["start_accent_list"][0]
        end_accent_list = yukarin_sa_args["end_accent_list"][0]
        start_accent_phrase_list = yukarin_sa_args["start_accent_phrase_list"][0]
        end_accent_phrase_list = yukarin_sa_args["end_accent_phrase_list"][0]
        features = (
            vowel_phoneme_list,
            consonant_phoneme_list,
            start_accent_list,
            end_accent_list,
            start_accent_phrase_list,
            end_accent_phrase_list,
        )
        self.assertEqual(list_length, 12)
        for feature in features:
            self.assertEqual(list_length, len(feature))
        self.assertEqual(yukarin_sa_args["speaker_id"], 1)

        numpy.testing.assert_array_equal(
            vowel_phoneme_list,
            numpy.array([0, 30, 4, 21, 21, 7, 0, 21, 30, 14, 6, 0]),
        )
        numpy.testing.assert_array_equal(
            consonant_phoneme_list,
            numpy.array([-1, 23, -1, 28, 10, 42, -1, 19, 19, 12, 35, -1]),
        )
        numpy.testing.assert_array_equal(
            start_accent_list, numpy.array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0])
        )
        numpy.testing.assert_array_equal(
            end_accent_list, numpy.array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0])
        )
        numpy.testing.assert_array_equal(
            start_accent_phrase_list, numpy.array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])
        )
        numpy.testing.assert_array_equal(
            end_accent_phrase_list, numpy.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])
        )

        # Rebuild the expected phrases with the same arithmetic as
        # yukarin_sa_mock for speaker id 1.
        true_result = deepcopy(self.accent_phrases_hello_hiho)
        # Start at 1 to skip the leading pause entry.
        index = 1

        # Computed once here rather than on every result_value call.
        unvoiced_mora_phoneme_id_list = [
            OjtPhoneme(p, 0, 0).phoneme_id for p in unvoiced_mora_phoneme_list
        ]

        def result_value(i: int):
            # Unvoiced vowels are expected to get pitch 0.
            if vowel_phoneme_list[i] in unvoiced_mora_phoneme_id_list:
                return 0
            return sum(feature[i] for feature in features) * 0.5 + 1

        for accent_phrase in true_result:
            for mora in accent_phrase.moras:
                mora.pitch = result_value(index)
                index += 1
            if accent_phrase.pause_mora is not None:
                accent_phrase.pause_mora.pitch = result_value(index)
                index += 1

        self.assertEqual(result, true_result)

    def synthesis_test_base(self, audio_query: AudioQuery):
        """Run ``synthesis`` on ``audio_query`` and check the decode call and output.

        The moras in ``audio_query`` are modified in place: fixed phoneme
        lengths are assigned, and voiced moras get a random pitch in
        [5.0, 6.0). The expected f0/phoneme frames are then rebuilt and
        compared with what the engine passed to ``decode_forward``. Each
        comparison passes when roughly four fifths of the entries agree.

        Args:
            audio_query: Query to synthesize; its accent phrases are mutated.
        """
        accent_phrases = audio_query.accent_phrases

        # Per-phoneme length, phoneme id and per-mora f0. Each list starts
        # with the leading pause entry.
        phoneme_length_list = [0.0]
        phoneme_id_list = [0]
        f0_list = [0.0]
        for accent_phrase in accent_phrases:
            for mora in accent_phrase.moras:
                if mora.consonant is not None:
                    mora.consonant_length = 0.1
                    phoneme_length_list.append(0.1)
                    phoneme_id_list.append(OjtPhoneme(mora.consonant, 0, 0).phoneme_id)
                mora.vowel_length = 0.2
                phoneme_length_list.append(0.2)
                phoneme_id_list.append(OjtPhoneme(mora.vowel, 0, 0).phoneme_id)
                if mora.vowel not in unvoiced_mora_phoneme_list:
                    # NOTE(review): unseeded random pitch, so the input
                    # differs from run to run.
                    mora.pitch = 5.0 + random()
                f0_list.append(mora.pitch)
            if accent_phrase.pause_mora is not None:
                accent_phrase.pause_mora.vowel_length = 0.2
                phoneme_length_list.append(0.2)
                phoneme_id_list.append(OjtPhoneme("pau", 0, 0).phoneme_id)
                f0_list.append(0.0)
        # Trailing pause entry.
        phoneme_length_list.append(0.0)
        phoneme_id_list.append(0)
        f0_list.append(0.0)

        # The leading/trailing pause lengths come from the query, and every
        # length is scaled by the speaking speed.
        phoneme_length_list[0] = audio_query.prePhonemeLength
        phoneme_length_list[-1] = audio_query.postPhonemeLength
        phoneme_length_list = [
            phoneme_length / audio_query.speedScale
            for phoneme_length in phoneme_length_list
        ]

        result = self.synthesis_engine.synthesis(query=audio_query, speaker_id=1)

        # Check the keyword arguments of the latest decode_forward call.
        decode_args = self.decode_mock.call_args[1]
        list_length = decode_args["length"]
        self.assertEqual(
            list_length,
            int(sum(round(p * 24000 / 256) for p in phoneme_length_list)),
        )

        num_phoneme = OjtPhoneme.num_phoneme
        mora_phoneme_id_list = [
            OjtPhoneme(p, 0, 0).phoneme_id for p in mora_phoneme_list
        ]

        # Rebuild the expected per-frame f0 and one-hot phoneme rows.
        f0 = []
        phoneme = []
        f0_index = 0
        voiced_f0 = []
        pitch_factor = 2**audio_query.pitchScale
        for i, phoneme_length in enumerate(phoneme_length_list):
            f0_single = (
                numpy.array(f0_list[f0_index], dtype=numpy.float32) * pitch_factor
            )
            # Rows are never mutated afterwards, so one list per phoneme
            # can be shared by all of its frames.
            one_hot = [0] * num_phoneme
            one_hot[phoneme_id_list[i]] = 1
            for _ in range(int(round(phoneme_length * (24000 / 256)))):
                f0.append([f0_single])
                phoneme.append(one_hot)
            # f0_list has one entry per mora, so advance only after a
            # phoneme that ends a mora (consonants share the f0 of the
            # following vowel).
            if phoneme_id_list[i] in mora_phoneme_id_list:
                if f0_single > 0:
                    voiced_f0.append(f0_single)
                f0_index += 1

        # Scale the deviation of each voiced frame from the mean voiced f0.
        mean_f0 = numpy.array(voiced_f0, dtype=numpy.float32).mean()
        f0 = numpy.array(f0, dtype=numpy.float32)
        voiced = f0[:, 0] != 0.0
        f0[voiced, 0] = (
            f0[voiced, 0] - mean_f0
        ) * audio_query.intonationScale + mean_f0

        phoneme = numpy.array(phoneme, dtype=numpy.float32)

        # Frame positions can be shifted by the per-phoneme rounding above,
        # and the rebuilt arrays can be slightly longer than what the engine
        # produced. So iterate over the engine's arrays and require only
        # about 4/5 of the frames to agree.
        decode_f0 = decode_args["f0"]
        matched_f0_count = sum(
            # rel_tol=10e-5 (i.e. 1e-4) absorbs float32 rounding differences.
            math.isclose(f0[i][0], frame[0], rel_tol=10e-5)
            for i, frame in enumerate(decode_f0)
        )
        self.assertGreaterEqual(matched_f0_count, int(len(decode_f0) / 5) * 4)

        decode_phoneme = decode_args["phoneme"]
        matched_phoneme_count = sum(
            numpy.array_equal(phoneme[i], row)
            for i, row in enumerate(decode_phoneme)
        )
        self.assertGreaterEqual(
            matched_phoneme_count, int(len(decode_phoneme) / 5) * 4
        )
        self.assertEqual(decode_args["speaker_id"], 1)

        # Expected waveform: decode the rebuilt frames with the same mock,
        # then apply the volume scale.
        true_result = decode_mock(list_length, num_phoneme, f0, phoneme, 1)
        true_result *= audio_query.volumeScale

        # The waveform is only compared at the 24000 Hz output rate.
        if audio_query.outputSamplingRate != 24000:
            return

        matched_result_count = 0
        for i, expected in enumerate(true_result):
            if audio_query.outputStereo:
                # In stereo output both channels must carry the same value.
                matched_result_count += math.isclose(
                    expected, result[i][0], rel_tol=10e-5
                ) and math.isclose(expected, result[i][1], rel_tol=10e-5)
            else:
                matched_result_count += math.isclose(
                    expected, result[i], rel_tol=10e-5
                )
        self.assertGreaterEqual(matched_result_count, int(len(true_result) / 5) * 4)

    def test_synthesis(self):
        """Exercise ``synthesis`` while varying one query parameter at a time."""
        audio_query = AudioQuery(
            accent_phrases=deepcopy(self.accent_phrases_hello_hiho),
            speedScale=1.0,
            pitchScale=1.0,
            intonationScale=1.0,
            volumeScale=1.0,
            prePhonemeLength=0.1,
            postPhonemeLength=0.1,
            outputSamplingRate=24000,
            outputStereo=False,
            kana="",
        )

        # Baseline.
        self.synthesis_test_base(audio_query)

        # Speaking speed.
        audio_query.speedScale = 1.2
        self.synthesis_test_base(audio_query)

        # Pitch.
        audio_query.pitchScale = 1.5
        audio_query.speedScale = 1.0
        self.synthesis_test_base(audio_query)

        # Intonation.
        audio_query.pitchScale = 1.0
        audio_query.intonationScale = 1.4
        self.synthesis_test_base(audio_query)

        # Volume.
        audio_query.intonationScale = 1.0
        audio_query.volumeScale = 2.0
        self.synthesis_test_base(audio_query)

        # Leading and trailing silence.
        audio_query.volumeScale = 1.0
        audio_query.prePhonemeLength = 0.5
        audio_query.postPhonemeLength = 0.5
        self.synthesis_test_base(audio_query)

        # Output sampling rate.
        audio_query.prePhonemeLength = 0.1
        audio_query.postPhonemeLength = 0.1
        audio_query.outputSamplingRate = 48000
        self.synthesis_test_base(audio_query)

        # Stereo output.
        audio_query.outputSamplingRate = 24000
        audio_query.outputStereo = True
        self.synthesis_test_base(audio_query)
|
|