Spaces:

crlotwhite
/

UTAU-WebUI

Sleeping

App Files Files Community

UTAU-WebUI / compressed_utau_engine.py

crlotwhite

Add UTAU WebUI project with LFS support for voice files

1056960 7 months ago

raw

history blame contribute delete

9.06 kB

	import logging
	import tempfile
	from pathlib import Path
	from typing import Any, Dict, List, Optional, Tuple, Union

	import numpy as np
	import soundfile as sf

	from straycat import Resampler
	from voice_data_converter import CompressedVoicebankManager, OtoEntry

	# Module-level logger, named after this module; used by CompressedUTAUEngine below.
	logger = logging.getLogger(__name__)

	class CompressedUTAUEngine:
	    """UTAU-compatible singing synthesis engine that reads a compressed HDF5 voicebank.

	    Samples and oto entries are obtained from a ``CompressedVoicebankManager``;
	    each note is rendered through the straycat ``Resampler`` and mixed into a
	    single mono buffer that is written to a temporary WAV file.
	    """

	    # Single-syllable Hangul (plus Korean solfège names) -> romaji alias.
	    # Built once at class creation instead of on every lookup.
	    _HANGUL_TO_ROMAJI = {
	        '가': 'ka', '나': 'na', '다': 'da', '라': 'ra', '마': 'ma',
	        '바': 'ba', '사': 'sa', '아': 'a', '자': 'za', '차': 'cha',
	        '카': 'ka', '타': 'ta', '파': 'pa', '하': 'ha',
	        '거': 'ke', '너': 'ne', '더': 'de', '러': 're', '머': 'me',
	        '버': 'be', '서': 'se', '어': 'e', '저': 'ze', '처': 'che',
	        '커': 'ke', '터': 'te', '퍼': 'pe', '허': 'he',
	        '고': 'ko', '노': 'no', '도': 'do', '로': 'ro', '모': 'mo',
	        '보': 'bo', '소': 'so', '오': 'o', '조': 'zo', '초': 'cho',
	        '코': 'ko', '토': 'to', '포': 'po', '호': 'ho',
	        '구': 'ku', '누': 'nu', '두': 'du', '루': 'ru', '무': 'mu',
	        '부': 'bu', '수': 'su', '우': 'u', '주': 'zu', '추': 'chu',
	        '쿠': 'ku', '투': 'tu', '푸': 'pu', '후': 'hu',
	        '기': 'ki', '니': 'ni', '디': 'di', '리': 'ri', '미': 'mi',
	        '비': 'bi', '시': 'si', '이': 'i', '지': 'zi', '치': 'chi',
	        '키': 'ki', '티': 'ti', '피': 'pi', '히': 'hi',
	        # Solfège names not already covered by the rows above.
	        '레': 're', '솔': 'so',
	    }

	    def __init__(self, compressed_voicebank_path: Union[str, Path]):
	        """Open the compressed voicebank located at ``compressed_voicebank_path``."""
	        self.voicebank = CompressedVoicebankManager(compressed_voicebank_path)
	        self.default_phoneme = "あ"  # fallback phoneme for empty/unknown lyrics
	        logger.info("압축된 UTAU 엔진 초기화 완료")

	    def synthesize_sequence(self,
	                            notes: List[Dict],
	                            lyrics: List[str],
	                            tempo: int = 120,
	                            volume: int = 100) -> Tuple[Optional[str], str]:
	        """Render a note sequence with its lyrics into a temporary WAV file.

	        Args:
	            notes: One dict per note. Keys read here are ``startSeconds``,
	                ``durationSeconds``, ``endSeconds`` and ``velocity`` (all
	                optional); ``pitch`` is read by ``_synthesize_note``.
	            lyrics: One lyric per note, same length as ``notes``.
	            tempo: Forwarded to the resampler as ``'!<tempo>'``.
	            volume: Forwarded to the resampler.

	        Returns:
	            ``(wav_path, message)`` on success, ``(None, message)`` on failure.
	            Notes that cannot be rendered are logged and skipped rather than
	            aborting the whole sequence.
	        """
	        if len(notes) != len(lyrics):
	            return None, "노트와 가사의 개수가 일치하지 않습니다."

	        if not notes:
	            return None, "합성할 노트가 없습니다."

	        try:
	            # Length of the whole sequence, with one extra second of tail.
	            max_end_time = max(
	                note.get('endSeconds',
	                         note.get('startSeconds', 0) + note.get('durationSeconds', 0.5))
	                for note in notes
	            )

	            sample_rate = 44100
	            final_audio = np.zeros(int(max_end_time * sample_rate) + sample_rate)

	            for index, (note, lyric) in enumerate(zip(notes, lyrics), start=1):
	                try:
	                    phoneme = self._lyric_to_phoneme(lyric)

	                    oto_entry = self.voicebank.get_sample_for_phoneme(phoneme)
	                    if not oto_entry:
	                        logger.warning(f"음소 '{phoneme}'에 해당하는 샘플을 찾을 수 없음")
	                        continue

	                    # Source audio comes out of the compressed store.
	                    audio_result = self.voicebank.get_audio_data(oto_entry.filename)
	                    if not audio_result:
	                        logger.warning(f"오디오 파일 로드 실패: {oto_entry.filename}")
	                        continue

	                    source_audio, source_sample_rate = audio_result

	                    synth_audio = self._synthesize_note(
	                        note, oto_entry, source_audio, source_sample_rate, tempo, volume
	                    )
	                    if synth_audio is None:
	                        # _synthesize_note has already logged the reason.
	                        continue

	                    # Clamp so a negative start time cannot turn into a
	                    # negative (wrap-around) slice index.
	                    start_sample = max(0, int(note.get('startSeconds', 0) * sample_rate))
	                    end_sample = start_sample + len(synth_audio)

	                    # Grow the mix buffer first, then mix in exactly one place.
	                    if end_sample > len(final_audio):
	                        grown = np.zeros(end_sample + sample_rate)
	                        grown[:len(final_audio)] = final_audio
	                        final_audio = grown

	                    final_audio[start_sample:end_sample] += (
	                        synth_audio * (note.get('velocity', 100) / 100)
	                    )

	                    logger.info(f"노트 {index} 합성 완료: {phoneme}")

	                except Exception as e:
	                    # Best effort: one bad note must not abort the sequence.
	                    logger.error(f"노트 {index} 합성 실패: {e}")
	                    continue

	            # Peak-normalise to 0.85 full scale (skipped for pure silence).
	            peak = np.max(np.abs(final_audio))
	            if peak > 0:
	                final_audio = final_audio / peak * 0.85

	            # Reserve a name, close the handle, then write by name so the file
	            # is not held open while soundfile opens it again.
	            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as handle:
	                output_path = handle.name
	            try:
	                sf.write(output_path, final_audio, sample_rate)
	            except Exception:
	                # Do not leave an empty/partial file behind on failure.
	                Path(output_path).unlink(missing_ok=True)
	                raise

	            duration_sec = len(final_audio) / sample_rate
	            return output_path, f"✅ 압축된 보이스뱅크로 합성 완료: {len(notes)}개 노트, {duration_sec:.1f}초"

	        except Exception as e:
	            logger.error(f"시퀀스 합성 실패: {e}")
	            return None, f"❌ 합성 실패: {str(e)}"

	    def _lyric_to_phoneme(self, lyric: str) -> str:
	        """Map a lyric to the phoneme alias used for the voicebank lookup.

	        Empty lyrics yield ``self.default_phoneme``. Lyrics found in the
	        Hangul table are returned as their romaji alias without checking the
	        voicebank. Anything else is returned unchanged when it is contained in
	        ``self.voicebank.oto_entries``, otherwise the default phoneme is used.
	        """
	        lyric = lyric.strip()
	        if not lyric:
	            return self.default_phoneme

	        mapped = self._HANGUL_TO_ROMAJI.get(lyric)
	        if mapped is not None:
	            return mapped

	        return lyric if lyric in self.voicebank.oto_entries else self.default_phoneme

	    def _synthesize_note(self,
	                         note: Dict,
	                         oto_entry: "OtoEntry",
	                         source_audio: np.ndarray,
	                         source_sample_rate: int,
	                         tempo: int,
	                         volume: int) -> Optional[np.ndarray]:
	        """Render a single note from in-memory source audio.

	        The source audio is written to a temporary WAV, passed through the
	        straycat ``Resampler`` and the result is read back. Both temporary
	        files are removed on every exit path.

	        Returns:
	            The rendered samples, or ``None`` if anything went wrong (the
	            error is logged, never raised).
	        """
	        input_path: Optional[Path] = None
	        output_path: Optional[Path] = None
	        try:
	            # Reserve the file names; handles are closed before use so other
	            # libraries can reopen the files by name.
	            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as src:
	                input_path = Path(src.name)
	            sf.write(str(input_path), source_audio, source_sample_rate)

	            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as dst:
	                output_path = Path(dst.name)

	            pitch = note['pitch']
	            duration_ms = note.get('durationSeconds', 0.5) * 1000
	            velocity = note.get('velocity', 100)

	            note_name = self._midi_to_note_name(pitch)

	            # The instance itself is not needed afterwards; constructing it is
	            # what is relied upon to produce out_file.
	            # NOTE(review): straycat may also write a feature cache next to
	            # in_file; if it does, that file is not cleaned up here - confirm.
	            Resampler(
	                in_file=str(input_path),
	                out_file=str(output_path),
	                pitch=note_name,
	                velocity=velocity,
	                length=max(duration_ms, 200),  # at least 200 ms
	                volume=volume,
	                offset=oto_entry.offset,
	                consonant=oto_entry.consonant,
	                cutoff=oto_entry.cutoff,
	                modulation=10,
	                tempo=f'!{tempo}'
	            )

	            # The output file was pre-created above, so mere existence proves
	            # nothing; an empty file means the resampler wrote no audio.
	            if not output_path.exists() or output_path.stat().st_size == 0:
	                logger.error(f"합성된 파일이 생성되지 않음: {output_path}")
	                return None

	            synth_audio, _ = sf.read(str(output_path))
	            return synth_audio

	        except Exception as e:
	            logger.error(f"노트 합성 실패: {e}")
	            return None

	        finally:
	            # Always remove the temporary files, including on failure.
	            for temp_path in (input_path, output_path):
	                if temp_path is not None:
	                    temp_path.unlink(missing_ok=True)

	    def _midi_to_note_name(self, midi_note: int) -> str:
	        """Convert a MIDI note number to a note name such as ``'C4'`` (60) or ``'A4'`` (69).

	        Float values (e.g. ``60.0`` coming from JSON) are rounded to the
	        nearest integer instead of failing on list indexing.
	        """
	        names = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')
	        octave, step = divmod(int(round(midi_note)), 12)
	        return f"{names[step]}{octave - 1}"

	    def get_available_phonemes(self) -> List[str]:
	        """Return the phonemes reported by the voicebank manager."""
	        return self.voicebank.list_available_phonemes()

	    def get_compression_info(self) -> Dict[str, Any]:
	        """Return the compression information reported by the voicebank manager."""
	        return self.voicebank.get_compression_info()