# Spaces: crlotwhite / UTAU-WebUI (Running)
# File size: 9,063 Bytes
import logging
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import soundfile as sf
from straycat import Resampler

from voice_data_converter import CompressedVoicebankManager, OtoEntry

# Module-level logger, named after this module, used by every method below.
logger = logging.getLogger(__name__)

class CompressedUTAUEngine:
    """UTAU-compatible voice synthesis engine backed by a compressed HDF5 voicebank.

    Oto entries and sample audio are fetched through a
    ``CompressedVoicebankManager``; each note is rendered with the straycat
    ``Resampler`` and the rendered notes are mixed into a single WAV file.
    """

    # Sample rate of the mixed output. Rendered notes are added sample-for-sample,
    # so the resampler output is assumed to use this rate as well (TODO confirm).
    _SAMPLE_RATE = 44100

    # Shortest note length, in milliseconds, that is requested from the resampler.
    _MIN_NOTE_MS = 200

    # Pitch-class names indexed by ``midi_note % 12``.
    _NOTE_NAMES = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')

    # Simple Hangul syllable -> romanized phoneme table, built once for the class
    # instead of on every lookup. The last row holds the solfege syllables that
    # are not already covered by the rows above.
    _HANGUL_TO_PHONEME = {
        '가': 'ka', '나': 'na', '다': 'da', '라': 'ra', '마': 'ma',
        '바': 'ba', '사': 'sa', '아': 'a', '자': 'za', '차': 'cha',
        '카': 'ka', '타': 'ta', '파': 'pa', '하': 'ha',
        '거': 'ke', '너': 'ne', '더': 'de', '러': 're', '머': 'me',
        '버': 'be', '서': 'se', '어': 'e', '저': 'ze', '처': 'che',
        '커': 'ke', '터': 'te', '퍼': 'pe', '허': 'he',
        '고': 'ko', '노': 'no', '도': 'do', '로': 'ro', '모': 'mo',
        '보': 'bo', '소': 'so', '오': 'o', '조': 'zo', '초': 'cho',
        '코': 'ko', '토': 'to', '포': 'po', '호': 'ho',
        '구': 'ku', '누': 'nu', '두': 'du', '루': 'ru', '무': 'mu',
        '부': 'bu', '수': 'su', '우': 'u', '주': 'zu', '추': 'chu',
        '쿠': 'ku', '투': 'tu', '푸': 'pu', '후': 'hu',
        '기': 'ki', '니': 'ni', '디': 'di', '리': 'ri', '미': 'mi',
        '비': 'bi', '시': 'si', '이': 'i', '지': 'zi', '치': 'chi',
        '키': 'ki', '티': 'ti', '피': 'pi', '히': 'hi',
        '레': 're', '솔': 'so',
    }

    def __init__(self, compressed_voicebank_path: Union[str, Path]):
        """Open the compressed voicebank located at ``compressed_voicebank_path``."""
        self.voicebank = CompressedVoicebankManager(compressed_voicebank_path)
        self.default_phoneme = "あ"  # fallback phoneme for empty or unknown lyrics
        logger.info("압축된 UTAU 엔진 초기화 완료")

    def synthesize_sequence(self,
                            notes: List[Dict],
                            lyrics: List[str],
                            tempo: int = 120,
                            volume: int = 100) -> Tuple[Optional[str], str]:
        """Synthesize a note sequence with its lyrics into one WAV file.

        Args:
            notes: One dict per note. ``'pitch'`` (MIDI number) is required;
                ``'startSeconds'`` (default 0), ``'durationSeconds'``
                (default 0.5), ``'endSeconds'`` and ``'velocity'``
                (default 100) are optional.
            lyrics: One lyric per note, in the same order as ``notes``.
            tempo: Tempo forwarded to the resampler.
            volume: Volume forwarded to the resampler.

        Returns:
            ``(path, message)``. ``path`` is the written temporary WAV file, or
            ``None`` when the input is invalid or synthesis fails; ``message``
            is the user-facing status text. The caller owns the returned file.
        """
        if len(notes) != len(lyrics):
            return None, "노트와 가사의 개수가 일치하지 않습니다."

        if not notes:
            return None, "합성할 노트가 없습니다."

        try:
            sample_rate = self._SAMPLE_RATE

            # Size the mix buffer from the latest note end, plus one second of tail.
            max_end_time = max(
                note.get('endSeconds',
                         note.get('startSeconds', 0) + note.get('durationSeconds', 0.5))
                for note in notes
            )
            final_audio = np.zeros(int(max_end_time * sample_rate) + sample_rate)

            for i, (note, lyric) in enumerate(zip(notes, lyrics)):
                # A failing note is logged and skipped so the rest still renders.
                try:
                    phoneme = self._lyric_to_phoneme(lyric)

                    oto_entry = self.voicebank.get_sample_for_phoneme(phoneme)
                    if not oto_entry:
                        logger.warning(f"음소 '{phoneme}'에 해당하는 샘플을 찾을 수 없음")
                        continue

                    audio_result = self.voicebank.get_audio_data(oto_entry.filename)
                    if not audio_result:
                        logger.warning(f"오디오 파일 로드 실패: {oto_entry.filename}")
                        continue

                    source_audio, source_sample_rate = audio_result

                    synth_audio = self._synthesize_note(
                        note, oto_entry, source_audio, source_sample_rate, tempo, volume
                    )
                    if synth_audio is None:
                        continue

                    start_sample = int(note.get('startSeconds', 0) * sample_rate)
                    gain = note.get('velocity', 100) / 100
                    final_audio = self._mix_into(
                        final_audio, synth_audio * gain, start_sample, sample_rate
                    )

                    logger.info(f"노트 {i+1} 합성 완료: {phoneme}")

                except Exception as e:
                    logger.error(f"노트 {i+1} 합성 실패: {e}")
                    continue

            # Peak-normalize to 0.85 full scale; an all-silent mix is left untouched.
            peak = np.max(np.abs(final_audio))
            if peak > 0:
                final_audio = final_audio / peak * 0.85

            # Reserve a unique path and release the handle before writing by name.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as handle:
                output_path = handle.name
            try:
                sf.write(output_path, final_audio, sample_rate)
            except Exception:
                # Do not leave an empty or partial file behind when writing fails.
                Path(output_path).unlink(missing_ok=True)
                raise

            duration_sec = len(final_audio) / sample_rate
            return output_path, f"✅ 압축된 보이스뱅크로 합성 완료: {len(notes)}개 노트, {duration_sec:.1f}초"

        except Exception as e:
            logger.error(f"시퀀스 합성 실패: {e}")
            return None, f"❌ 합성 실패: {str(e)}"

    @staticmethod
    def _mix_into(buffer: np.ndarray,
                  audio: np.ndarray,
                  start: int,
                  headroom: int) -> np.ndarray:
        """Add ``audio`` into ``buffer`` at ``start`` and return the mix buffer.

        When the audio would run past the end, a larger zero-filled buffer
        (``headroom`` extra samples after the audio) is allocated first. The
        buffer passed in is only replaced through the return value, so a
        failing addition leaves the caller's buffer as it was.
        """
        end = start + len(audio)
        if end > len(buffer):
            grown = np.zeros(end + headroom)
            grown[:len(buffer)] = buffer
            buffer = grown
        buffer[start:end] += audio
        return buffer

    def _lyric_to_phoneme(self, lyric: str) -> str:
        """Convert a lyric to the phoneme used for the voicebank lookup.

        Blank lyrics give the default phoneme. Hangul syllables in the mapping
        table give their romanized phoneme. Any other lyric is returned as-is
        when it is a key of the voicebank's ``oto_entries``, otherwise the
        default phoneme is returned.
        """
        lyric = lyric.strip()
        if not lyric:
            return self.default_phoneme

        mapped = self._HANGUL_TO_PHONEME.get(lyric)
        if mapped is not None:
            return mapped

        return lyric if lyric in self.voicebank.oto_entries else self.default_phoneme

    def _synthesize_note(self,
                         note: Dict,
                         oto_entry: "OtoEntry",
                         source_audio: np.ndarray,
                         source_sample_rate: int,
                         tempo: int,
                         volume: int) -> Optional[np.ndarray]:
        """Render one note from decompressed sample audio.

        Args:
            note: Note dict; ``'pitch'`` is required, ``'durationSeconds'``
                (default 0.5) and ``'velocity'`` (default 100) are optional.
            oto_entry: Supplies ``offset``, ``consonant`` and ``cutoff``.
            source_audio: Sample audio to resample.
            source_sample_rate: Sample rate of ``source_audio``.
            tempo: Tempo, passed to the resampler as ``'!<tempo>'``.
            volume: Volume passed to the resampler.

        Returns:
            The rendered audio, or ``None`` when rendering fails (the error is
            logged, never raised).
        """
        try:
            # The resampler works on files. Keeping the input and the output in
            # a private temporary directory means they, and anything else
            # written next to them, are removed on every exit path.
            with tempfile.TemporaryDirectory() as work_dir:
                input_path = Path(work_dir) / "input.wav"
                output_path = Path(work_dir) / "output.wav"
                sf.write(str(input_path), source_audio, source_sample_rate)

                pitch = note['pitch']
                duration_ms = note.get('durationSeconds', 0.5) * 1000
                velocity = note.get('velocity', 100)

                note_name = self._midi_to_note_name(pitch)

                # The instance itself is not needed afterwards; only the file
                # at ``out_file`` is read back below.
                Resampler(
                    in_file=str(input_path),
                    out_file=str(output_path),
                    pitch=note_name,
                    velocity=velocity,
                    length=max(duration_ms, self._MIN_NOTE_MS),
                    volume=volume,
                    offset=oto_entry.offset,
                    consonant=oto_entry.consonant,
                    cutoff=oto_entry.cutoff,
                    modulation=10,
                    tempo=f'!{tempo}'
                )

                if not output_path.exists():
                    logger.error(f"합성된 파일이 생성되지 않음: {output_path}")
                    return None

                # Fully load the audio before the directory is cleaned up.
                synth_audio, _ = sf.read(str(output_path))
                return synth_audio

        except Exception as e:
            logger.error(f"노트 합성 실패: {e}")
            return None

    def _midi_to_note_name(self, midi_note: int) -> str:
        """Convert a MIDI note number to a note name such as ``'C4'``.

        MIDI 60 maps to ``'C4'``. Float values (e.g. ``60.0`` from a JSON
        payload) are rounded to the nearest integer first.
        """
        octave, pitch_class = divmod(int(round(midi_note)), 12)
        return f"{self._NOTE_NAMES[pitch_class]}{octave - 1}"

    def get_available_phonemes(self) -> List[str]:
        """Return the phonemes reported as available by the voicebank."""
        return self.voicebank.list_available_phonemes()

    def get_compression_info(self) -> Dict[str, Any]:
        """Return the compression information reported by the voicebank."""
        return self.voicebank.get_compression_info()