# Spaces: crlotwhite / UTAU-WebUI (Running)
# File size: 19,787 Bytes
import os
import re
import logging
import tempfile
import numpy as np
import soundfile as sf
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from dataclasses import dataclass
from straycat import Resampler

# Logging setup: configure the root logger at INFO level and create the
# module-level logger used throughout this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class OtoEntry:
    """A single record of a UTAU ``oto.ini`` file.

    Timing fields are expressed in milliseconds.
    """
    filename: str          # WAV file name
    alias: str             # alias (pronunciation label)
    offset: float          # offset (ms)
    consonant: float       # consonant length (ms)
    cutoff: float          # cutoff (ms)
    preutterance: float    # preutterance (ms)
    overlap: float         # overlap (ms)

    @property
    def is_breath(self) -> bool:
        """Return True when the alias begins with ``-`` or ``*``."""
        return self.alias.startswith(('-', '*'))

    @property
    def clean_alias(self) -> str:
        """Return the alias with one leading ``-``/``*`` marker removed.

        A marker followed by a space loses the space as well; an alias
        without a marker is returned unchanged.
        """
        # Two-character markers come first so that "- ka" yields "ka"
        # rather than " ka".
        for marker in ('- ', '* ', '-', '*'):
            if self.alias.startswith(marker):
                return self.alias[len(marker):]
        return self.alias

class VoicebankManager:
    """Load a UTAU voicebank folder and answer alias / WAV lookups.

    Constructing an instance immediately indexes the WAV files and parses
    the top-level ``oto.ini``.

    Attributes:
        voicebank_path: Root folder of the voicebank.
        oto_entries: Parsed ``oto.ini`` records keyed by alias.
        wav_files: WAV file paths keyed by bare file name.
    """

    # Encodings tried, in order, when reading oto.ini.  UTF-8 goes first
    # because it rejects almost every byte sequence that is not really
    # UTF-8, whereas the legacy Japanese codecs may decode foreign bytes
    # into mojibake without raising.  The "-sig" variant additionally drops
    # a leading BOM, which would otherwise be glued to the first file name.
    _OTO_ENCODINGS = ('utf-8-sig', 'shift_jis', 'cp932', 'euc-jp')

    def __init__(self, voicebank_path: Union[str, Path]):
        """Remember the voicebank location and load it right away.

        Args:
            voicebank_path: Folder that contains ``oto.ini`` and the WAVs.

        Raises:
            FileNotFoundError: If the folder or its ``oto.ini`` is missing.
        """
        self.voicebank_path = Path(voicebank_path)
        self.oto_entries: Dict[str, OtoEntry] = {}
        self.wav_files: Dict[str, Path] = {}
        self.load_voicebank()

    def load_voicebank(self):
        """Index the WAV files and parse ``oto.ini``.

        Results of a previous load are discarded first, so calling this
        again reflects the folder's current content.

        Raises:
            FileNotFoundError: If the folder or its ``oto.ini`` is missing.
        """
        if not self.voicebank_path.exists():
            raise FileNotFoundError(f"보이스뱅크 경로를 찾을 수 없습니다: {self.voicebank_path}")

        oto_file = self.voicebank_path / "oto.ini"
        if not oto_file.exists():
            raise FileNotFoundError(f"oto.ini 파일을 찾을 수 없습니다: {oto_file}")

        self.oto_entries.clear()
        self.wav_files.clear()

        self._index_wav_files()
        self._parse_oto_ini(oto_file)

        logger.info(f"보이스뱅크 로드 완료: {len(self.oto_entries)}개 엔트리, {len(self.wav_files)}개 WAV 파일")

    def _index_wav_files(self):
        """Index WAV files of the root folder and its direct subfolders.

        Files are keyed by bare name.  The root folder is scanned first and
        an already indexed name is never replaced, so a root-level file
        (the one the root ``oto.ini`` sits next to) wins over a same-named
        file in a subfolder.  Subfolders are visited in sorted order to make
        the result deterministic, and the ``.wav`` extension is matched
        case-insensitively.
        """
        root = self.voicebank_path
        subfolders = sorted(p for p in root.iterdir() if p.is_dir())
        for folder in [root, *subfolders]:
            for candidate in sorted(folder.iterdir()):
                if candidate.is_file() and candidate.suffix.lower() == '.wav':
                    self.wav_files.setdefault(candidate.name, candidate)

    def _parse_oto_ini(self, oto_file: Path):
        """Read ``oto.ini`` and register every parsable line.

        The file is decoded with the first encoding of ``_OTO_ENCODINGS``
        that does not raise.  Blank lines and lines starting with ``#`` are
        skipped; a line that fails to parse is logged and skipped.

        Args:
            oto_file: Path of the ``oto.ini`` file.

        Raises:
            ValueError: If no candidate encoding can decode the file.
        """
        try:
            content = None
            for encoding in self._OTO_ENCODINGS:
                try:
                    content = oto_file.read_text(encoding=encoding)
                except UnicodeDecodeError:
                    continue
                logger.info(f"oto.ini를 {encoding} 인코딩으로 읽었습니다.")
                break

            if content is None:
                raise ValueError("oto.ini 파일을 읽을 수 없습니다. 인코딩 문제가 있을 수 있습니다.")

            # Split the untrimmed text so that the reported line numbers
            # match the file even when it starts with blank lines.
            for line_num, raw_line in enumerate(content.split('\n'), 1):
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue

                try:
                    self._parse_oto_line(line)
                except Exception as e:
                    logger.warning(f"oto.ini {line_num}번째 줄 파싱 실패: {e}")

        except Exception as e:
            logger.error(f"oto.ini 파싱 실패: {e}")
            raise

    def _parse_oto_line(self, line: str):
        """Parse one ``oto.ini`` line and store it in ``oto_entries``.

        Expected format::

            filename=alias,offset,consonant,cutoff,preutterance,overlap

        Lines without ``=`` are ignored silently; lines that do not have
        exactly six comma-separated fields, or whose numeric fields are not
        numbers, are logged and ignored.

        A blank alias falls back to the file name without its extension
        (otherwise every alias-less entry would collide on the key ``''``),
        and a blank numeric field is read as ``0``.

        Args:
            line: One stripped, non-empty line of ``oto.ini``.
        """
        if '=' not in line:
            return

        filename, params = line.split('=', 1)
        parts = params.split(',')

        if len(parts) != 6:
            logger.warning(f"잘못된 oto.ini 형식: {line}")
            return

        try:
            offset, consonant, cutoff, preutterance, overlap = (
                float(field) if field.strip() else 0.0 for field in parts[1:]
            )
        except ValueError as e:
            logger.warning(f"oto.ini 파라미터 파싱 실패: {line} - {e}")
            return

        alias = parts[0] if parts[0].strip() else Path(filename).stem

        self.oto_entries[alias] = OtoEntry(
            filename=filename,
            alias=alias,
            offset=offset,
            consonant=consonant,
            cutoff=cutoff,
            preutterance=preutterance,
            overlap=overlap,
        )

    def get_sample_for_phoneme(self, phoneme: str) -> Optional["OtoEntry"]:
        """Find the oto entry to use for ``phoneme``.

        An entry whose alias equals ``phoneme`` is preferred.  Otherwise the
        first entry whose alias equals ``phoneme`` once its leading
        ``-``/``*`` marker is removed is returned.

        Args:
            phoneme: Alias to look up.

        Returns:
            The matching entry, or ``None`` when nothing matches.
        """
        exact = self.oto_entries.get(phoneme)
        if exact is not None:
            return exact

        # Every entry that matches here differs from ``phoneme`` only by a
        # stripped marker, so all of them are marker-prefixed; there is no
        # unprefixed candidate left to prefer.
        for entry in self.oto_entries.values():
            if entry.clean_alias == phoneme:
                return entry
        return None

    def get_wav_path(self, filename: str) -> Optional[Path]:
        """Return the indexed path of ``filename`` or ``None`` if unknown."""
        return self.wav_files.get(filename)

    def list_available_phonemes(self) -> List[str]:
        """Return the distinct marker-free aliases, sorted.

        Sorting keeps the result identical from run to run; plain set order
        of strings varies between interpreter runs.
        """
        return sorted({entry.clean_alias for entry in self.oto_entries.values()})

class UTAUEngine:
    """UTAU-compatible singing synthesis engine.

    For every note the engine turns the lyric into a voicebank alias, looks
    up the matching sample, renders it with the straycat ``Resampler`` and
    mixes all rendered notes into a single WAV file.
    """

    # Hangul syllable -> romanised phoneme (table written for the
    # "hanseol CVC" voicebank).  The values are the ones that were effective
    # in the original dict literal, where several keys were listed more than
    # once and the last occurrence silently won.
    _KOREAN_MAP = {
        # plain vowels
        '아': 'a', '이': 'i', '우': 'u', '에': 'e', '오': 'o', '으': 'eu', '어': 'eo',
        # consonant + vowel
        '바': 'ba', '비': 'bi', '부': 'bu', '베': 'be', '보': 'bo', '브': 'beu', '버': 'beo',
        '다': 'da', '디': 'di', '두': 'du', '데': 'de', '도': 'do', '드': 'deu', '더': 'deo',
        '가': 'ga', '기': 'gi', '구': 'gu', '게': 'ge', '고': 'go', '그': 'geu', '거': 'geo',
        '하': 'ha', '히': 'hi', '후': 'hu', '헤': 'he', '호': 'ho', '흐': 'heu', '허': 'heo',
        '자': 'ja', '지': 'ji', '주': 'ju', '제': 'je', '조': 'jo', '즈': 'jeu', '저': 'jeo',
        '카': 'ka', '키': 'ki', '쿠': 'ku', '케': 'ke', '코': 'ko', '크': 'keu', '커': 'keo',
        '라': 'la', '리': 'ri', '루': 'ru', '레': 're', '로': 'ro', '르': 'reu', '러': 'reo',
        '마': 'ma', '미': 'mi', '무': 'mu', '메': 'me', '모': 'mo', '므': 'meu', '머': 'meo',
        '나': 'na', '니': 'ni', '누': 'nu', '네': 'ne', '노': 'no', '느': 'neu', '너': 'neo',
        '파': 'fa', '피': 'pi', '푸': 'pu', '페': 'pe', '포': 'po', '프': 'peu', '퍼': 'peo',
        '사': 'sa', '시': 'si', '수': 'su', '세': 'se', '소': 'so', '스': 'seu', '서': 'seo',
        '타': 'ta', '티': 'ti', '투': 'tu', '테': 'te', '토': 'to', '트': 'teu', '터': 'teo',
        # compound vowels
        '야': 'ya', '예': 'ye', '여': 'yeo', '요': 'yo', '유': 'yu', '의': 'eui',
        '와': 'wa', '웨': 'we', '위': 'wi', '워': 'weo',
        # solfege syllable that is not covered by the rows above
        '솔': 'so',
    }

    # Spellings the original table also listed for a syllable but which were
    # shadowed by a later duplicate key.  They are consulted only when the
    # primary spelling does not exist in the loaded voicebank.
    _KOREAN_ALTERNATES = {
        '라': ('ra',), '리': ('li',), '루': ('lu',), '레': ('le',),
        '로': ('lo',), '르': ('leu',), '러': ('leo',),
        '파': ('pa',),
    }

    # Romaji -> hiragana (Japanese voicebanks).
    _ROMAJI_MAP = {
        'a': 'あ', 'i': 'い', 'u': 'う', 'e': 'え', 'o': 'お',
        'ka': 'か', 'ki': 'き', 'ku': 'く', 'ke': 'け', 'ko': 'こ',
        'sa': 'さ', 'shi': 'し', 'su': 'す', 'se': 'せ', 'so': 'そ',
        'ta': 'た', 'chi': 'ち', 'tsu': 'つ', 'te': 'て', 'to': 'と',
        'na': 'な', 'ni': 'に', 'nu': 'ぬ', 'ne': 'ね', 'no': 'の',
        'ha': 'は', 'hi': 'ひ', 'fu': 'ふ', 'he': 'へ', 'ho': 'ほ',
        'ma': 'ま', 'mi': 'み', 'mu': 'む', 'me': 'め', 'mo': 'も',
        'ya': 'や', 'yu': 'ゆ', 'yo': 'よ',
        'ra': 'ら', 'ri': 'り', 'ru': 'る', 're': 'れ', 'ro': 'ろ',
        'wa': 'わ', 'wo': 'を', 'n': 'ん',
    }

    def __init__(self, voicebank_path: Union[str, Path]):
        """Load the voicebank located at ``voicebank_path``.

        Args:
            voicebank_path: Folder handed to ``VoicebankManager``; errors
                raised while loading it propagate to the caller.
        """
        self.voicebank = VoicebankManager(voicebank_path)
        self.default_phoneme = "あ"  # fallback phoneme

    def synthesize_sequence(self,
                          notes: List[Dict],
                          lyrics: List[str],
                          tempo: int = 120,
                          volume: int = 100) -> Tuple[Optional[str], str]:
        """Render a note sequence with lyrics into one WAV file.

        Notes whose sample cannot be found or rendered are skipped; the
        mix is peak-normalised to 0.8 before it is written.

        Args:
            notes: One dict per note.  Keys read here and by
                ``_synthesize_note``: ``pitch``, ``startSeconds``,
                ``durationSeconds``, optional ``endSeconds`` and
                ``velocity``.
            lyrics: One lyric per note, in the same order as ``notes``.
            tempo: Tempo forwarded to the resampler.
            volume: Volume forwarded to the resampler.

        Returns:
            ``(path, message)``: ``path`` is the written temporary WAV file,
            or ``None`` on failure; ``message`` is a status text for the
            user.  The caller owns the returned file.
        """
        if len(notes) != len(lyrics):
            return None, "노트와 가사의 개수가 일치하지 않습니다."

        if not notes:
            return None, "노트가 없습니다."

        try:
            sample_rate = 44100
            # NOTE(review): the rendered clips are mixed as if they were at
            # this rate; the rate reported by sf.read is not checked.
            last_end_seconds = max(
                note.get('endSeconds', note.get('startSeconds', 0) + note.get('durationSeconds', 0.5))
                for note in notes
            )
            # One extra second of headroom after the last note.
            final_audio = np.zeros(int(last_end_seconds * sample_rate) + sample_rate)

            synthesized_count = 0

            for number, (note, lyric) in enumerate(zip(notes, lyrics), 1):
                try:
                    phoneme = self._lyric_to_phoneme(lyric)

                    oto_entry = self.voicebank.get_sample_for_phoneme(phoneme)
                    if not oto_entry:
                        logger.warning(f"음소 '{phoneme}' (가사: '{lyric}')에 해당하는 샘플을 찾을 수 없습니다.")
                        continue

                    wav_path = self.voicebank.get_wav_path(oto_entry.filename)
                    if not wav_path or not wav_path.exists():
                        logger.warning(f"WAV 파일을 찾을 수 없습니다: {oto_entry.filename}")
                        continue

                    synth_audio = self._synthesize_note(note, oto_entry, wav_path, tempo, volume)
                    if synth_audio is None:
                        continue

                    # A negative start would turn into a from-the-end slice;
                    # clamp it to the beginning of the buffer instead.
                    start_sample = max(0, int(note.get('startSeconds', 0) * sample_rate))
                    final_audio = self._mix_into(final_audio, synth_audio, start_sample, sample_rate)

                    synthesized_count += 1
                    logger.info(f"노트 {number} 합성 완료: {lyric} -> {phoneme}")

                except Exception as e:
                    logger.error(f"노트 {number} 합성 실패: {e}")
                    continue

            if synthesized_count == 0:
                return None, "합성된 노트가 없습니다."

            peak = np.max(np.abs(final_audio))
            if peak > 0:
                final_audio = final_audio / peak * 0.8

            # Reserve a file name, close the handle, then let soundfile
            # write to it.  If writing fails the file is removed so that no
            # orphaned temp file is left behind.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as handle:
                output_path = handle.name
            try:
                sf.write(output_path, final_audio, sample_rate)
            except Exception:
                try:
                    os.unlink(output_path)
                except OSError:
                    pass
                raise

            duration_sec = len(final_audio) / sample_rate
            return output_path, f"✅ UTAU 합성 완료: {synthesized_count}/{len(notes)}개 노트, {duration_sec:.1f}초"

        except Exception as e:
            error_msg = f"❌ UTAU 합성 중 오류: {str(e)}"
            logger.error(error_msg)
            return None, error_msg

    @staticmethod
    def _mix_into(buffer: np.ndarray, clip: np.ndarray, start_sample: int, sample_rate: int) -> np.ndarray:
        """Add ``clip`` onto ``buffer`` starting at ``start_sample``.

        When the clip reaches past the end, a larger buffer (clip end plus
        one second) is allocated and returned; otherwise ``buffer`` is
        modified in place and returned.  A multi-channel clip is averaged
        down to mono first because the mix buffer is one-dimensional.
        """
        clip = np.asarray(clip)
        if clip.ndim > 1:
            clip = clip.mean(axis=1)

        end_sample = start_sample + len(clip)
        if end_sample > len(buffer):
            grown = np.zeros(end_sample + sample_rate)
            grown[:len(buffer)] = buffer
            buffer = grown

        buffer[start_sample:end_sample] += clip
        return buffer

    def _lyric_to_phoneme(self, lyric: str) -> str:
        """Translate a lyric into a voicebank alias.

        Candidates are built in this order: the Korean table entry (plus its
        alternate spellings), or else the romaji-to-hiragana entry, and
        finally the lyric itself.  The first candidate that exists in the
        voicebank wins.  This lets a romanised voicebank receive ``ka``
        as typed instead of always getting ``か``.

        If no candidate exists in the voicebank, the table translation is
        returned anyway (so the caller reports the missing sample); a lyric
        without any table entry falls back to ``default_phoneme``.

        Args:
            lyric: Lyric text of one note; surrounding whitespace is ignored.

        Returns:
            The alias to look up in the voicebank.
        """
        lyric = lyric.strip()
        if not lyric:
            return self.default_phoneme

        candidates = []
        if lyric in self._KOREAN_MAP:
            candidates.append(self._KOREAN_MAP[lyric])
            candidates.extend(self._KOREAN_ALTERNATES.get(lyric, ()))
        else:
            kana = self._ROMAJI_MAP.get(lyric.lower())
            if kana is not None:
                candidates.append(kana)
        translated = bool(candidates)
        candidates.append(lyric)  # the lyric may already be an alias

        available = set(self.voicebank.list_available_phonemes())
        for candidate in candidates:
            if candidate in available:
                return candidate

        if translated:
            return candidates[0]

        logger.warning(f"알 수 없는 가사: '{lyric}', 기본 음소 '{self.default_phoneme}' 사용")
        return self.default_phoneme

    def _synthesize_note(self,
                        note: Dict,
                        oto_entry: "OtoEntry",
                        wav_path: Path,
                        tempo: int,
                        volume: int) -> Optional[np.ndarray]:
        """Render one note with the straycat resampler.

        Args:
            note: Note dict; ``pitch`` is required, ``durationSeconds``
                (default 0.5) and ``velocity`` (default 100) are optional.
            oto_entry: oto.ini record of the sample to render.
            wav_path: Path of the sample's WAV file.
            tempo: Tempo forwarded to the resampler.
            volume: Volume forwarded to the resampler.

        Returns:
            The rendered samples, or ``None`` when rendering failed or
            produced no audio.  Failures are logged, never raised.
        """
        try:
            note_name = self._midi_to_note_name(note['pitch'])
            # Notes shorter than 200 ms are rendered as 200 ms.
            duration = max(note.get('durationSeconds', 0.5) * 1000, 200)

            # Only the name is needed; the resampler writes the file itself.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                temp_output = temp_file.name
        except Exception as e:
            logger.error(f"노트 합성 실패: {e}")
            return None

        try:
            offset = max(oto_entry.offset, 0)  # no negative offsets
            # Missing/non-positive consonant length defaults to 50 ms;
            # positive values are raised to at least 10 ms.
            consonant = max(oto_entry.consonant if oto_entry.consonant > 0 else 50, 10)
            # In oto.ini a positive cutoff is measured back from the end of
            # the file and a negative one forward from the offset, so a
            # negative value must not be clamped away.
            # NOTE(review): assumes the resampler accepts negative cutoff
            # the way UTAU resamplers do - confirm against straycat.
            cutoff = oto_entry.cutoff

            try:
                info = sf.info(wav_path)
                wav_duration_ms = (info.frames / info.samplerate) * 1000

                # Keep at least ~100 ms of audio after the offset.
                if offset >= wav_duration_ms - 100:
                    offset = max(0, wav_duration_ms - 200)
                    logger.warning(f"오프셋이 너무 큽니다. {offset}ms로 조정했습니다.")

                # A relative cutoff that would end past the file falls back
                # to "use everything up to the end of the file".
                if cutoff < 0 and offset - cutoff > wav_duration_ms:
                    cutoff = 0

                max_consonant = min(duration / 2, wav_duration_ms - offset - 50)
                consonant = min(consonant, max_consonant)

            except Exception as e:
                logger.warning(f"WAV 파일 정보 확인 실패: {e}")

            # Enforce minimum lengths after all adjustments.
            consonant = max(consonant, 10)
            duration = max(duration, consonant + 50)

            logger.info(f"합성 파라미터: offset={offset:.1f}ms, consonant={consonant:.1f}ms, duration={duration:.1f}ms")

            # NOTE(review): rendering appears to happen as a side effect of
            # constructing Resampler - the instance itself is never used.
            Resampler(
                in_file=str(wav_path),
                out_file=temp_output,
                pitch=note_name,
                velocity=note.get('velocity', 100),
                length=duration,
                volume=volume,
                flags='',
                offset=offset,
                consonant=consonant,
                cutoff=cutoff,
                modulation=0,
                tempo=f'!{tempo}'
            )

            if not os.path.exists(temp_output):
                logger.warning("합성 결과 파일이 생성되지 않았습니다.")
                return None

            synth_audio, _ = sf.read(temp_output)
            if len(synth_audio) == 0:
                logger.warning("합성된 오디오가 비어있습니다.")
                return None

            return synth_audio

        except Exception as e:
            logger.error(f"straycat 합성 실패: {e}")
            return None

        finally:
            # Best-effort cleanup of the intermediate file.
            try:
                os.unlink(temp_output)
            except OSError:
                pass

    def _midi_to_note_name(self, midi_note: int) -> str:
        """Convert a MIDI note number to a note name such as ``C4``.

        MIDI 60 maps to ``C4``.  Values that are numeric but not ``int``
        (for example ``60.0``) are rounded to the nearest note instead of
        failing on list indexing.

        Args:
            midi_note: MIDI note number.

        Returns:
            Note name with sharps and octave, e.g. ``"A#3"``.
        """
        names = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')
        octave, index = divmod(int(round(float(midi_note))), 12)
        return f"{names[index]}{octave - 1}"

    def get_available_phonemes(self) -> List[str]:
        """Return the phonemes (aliases) offered by the loaded voicebank."""
        return self.voicebank.list_available_phonemes()

# 테스트 함수
def test_utau_engine():
    """Smoke-test loading a voicebank from a fixed relative path.

    Builds a ``UTAUEngine`` for ``voice/hanseol CVC`` and prints how many
    phonemes it offers together with the first ten of them.

    Returns:
        The engine on success, or ``None`` when any step raised; the error
        is printed instead of being propagated.
    """
    bank_dir = "voice/hanseol CVC"
    try:
        utau = UTAUEngine(bank_dir)

        print("hanseol CVC 보이스뱅크 로드 완료!")
        print(f"사용 가능한 음소: {len(utau.get_available_phonemes())}개")
        print(f"첫 10개 음소: {utau.get_available_phonemes()[:10]}")
    except Exception as err:
        print(f"UTAU 엔진 테스트 실패: {err}")
        return None

    return utau

# Run the smoke test only when this file is executed as a script.
if __name__ == "__main__":
    test_utau_engine()