Spaces:

crlotwhite
/

UTAU-WebUI

Running

App Files Files Community

UTAU-WebUI / voice_data_converter.py

crlotwhite

Add UTAU WebUI project with LFS support for voice files

1056960 11 months ago

raw

history blame contribute delete

11.2 kB

	import h5py
	import numpy as np
	import soundfile as sf
	from pathlib import Path
	import logging
	import json
	import gzip
	from typing import Dict, List, Optional, Tuple, Union
	import shutil
	from utau_engine import VoicebankManager, OtoEntry

	logger = logging.getLogger(__name__)

	class VoiceDataCompressor:
	    """Pack a UTAU voicebank into a single compressed HDF5 file.

	    Layout written by :meth:`convert_voicebank_to_hdf5`:

	    * ``metadata/oto_data`` - gzip-compressed UTF-8 JSON of every oto entry,
	      stored as a ``uint8`` dataset.
	    * ``metadata`` attributes - voicebank name, entry / WAV counts and the
	      size statistics of the conversion.
	    * ``audio_data/<name>/audio`` - mono ``float32`` samples of one WAV file;
	      the ``<name>`` group carries ``sample_rate``, ``duration`` and
	      ``original_filename`` attributes.
	    """

	    def __init__(self, output_path: str = "voice_data.h5"):
	        """Remember where the HDF5 file goes and which compression to use.

	        Args:
	            output_path: Destination of the HDF5 file.
	        """
	        self.output_path = Path(output_path)
	        self.compression = 'gzip'  # HDF5 compression filter for audio datasets
	        self.compression_opts = 6  # gzip level (0-9)

	    @staticmethod
	    def _compression_ratio(original_size: int, compressed_size: int) -> float:
	        """Return the space saved in percent; 0.0 when nothing was converted."""
	        if original_size <= 0:
	            # No WAV file was stored: avoid dividing by zero.
	            return 0.0
	        return (1 - compressed_size / original_size) * 100

	    def _store_oto_metadata(self, meta_group, vb_manager, voicebank_name) -> None:
	        """Write the oto entries and the voicebank counters into ``metadata``."""
	        oto_data = {
	            alias: {
	                'filename': entry.filename,
	                'alias': entry.alias,
	                'offset': entry.offset,
	                'consonant': entry.consonant,
	                'cutoff': entry.cutoff,
	                'preutterance': entry.preutterance,
	                'overlap': entry.overlap,
	            }
	            for alias, entry in vb_manager.oto_entries.items()
	        }

	        # Serialise to JSON, gzip it, and store the bytes as a uint8 dataset.
	        oto_json = json.dumps(oto_data, ensure_ascii=False, indent=2)
	        oto_compressed = gzip.compress(oto_json.encode('utf-8'))
	        meta_group.create_dataset('oto_data', data=np.frombuffer(oto_compressed, dtype=np.uint8))

	        meta_group.attrs['voicebank_name'] = voicebank_name
	        meta_group.attrs['total_entries'] = len(vb_manager.oto_entries)
	        meta_group.attrs['total_wav_files'] = len(vb_manager.wav_files)

	    def _store_audio_files(self, audio_group, wav_files) -> int:
	        """Store every WAV as a compressed dataset; return the bytes converted.

	        A file that cannot be read or written is logged and skipped, and its
	        size is not counted, so the returned total matches the stored audio.
	        """
	        processed_files = set()
	        total_original_size = 0

	        for filename, wav_path in wav_files.items():
	            if filename in processed_files:
	                continue

	            try:
	                audio_data, sample_rate = sf.read(wav_path)
	                original_size = wav_path.stat().st_size

	                # Stereo -> mono: average across the second axis.
	                if audio_data.ndim > 1:
	                    audio_data = np.mean(audio_data, axis=1)

	                # Group name is the filename with '.wav' removed; the reader
	                # derives its lookup key the same way.
	                file_group = audio_group.create_group(filename.replace('.wav', ''))
	                file_group.create_dataset(
	                    'audio',
	                    data=audio_data.astype(np.float32),
	                    compression=self.compression,
	                    compression_opts=self.compression_opts,
	                    shuffle=True,  # byte-shuffle filter, improves compression
	                    fletcher32=True,  # per-chunk checksum
	                )

	                file_group.attrs['sample_rate'] = sample_rate
	                file_group.attrs['duration'] = len(audio_data) / sample_rate
	                file_group.attrs['original_filename'] = filename

	                processed_files.add(filename)
	                # Count the size only once the file is actually stored.
	                total_original_size += original_size
	                logger.info(f"변환 완료: {filename} ({len(audio_data)} samples)")

	            except Exception as e:
	                logger.error(f"파일 처리 실패 {wav_path}: {e}")
	                continue

	        return total_original_size

	    def convert_voicebank_to_hdf5(self, voicebank_path: Union[str, Path]) -> bool:
	        """Convert the voicebank at ``voicebank_path`` into ``self.output_path``.

	        Args:
	            voicebank_path: Directory of the voicebank, handed to
	                ``VoicebankManager``.

	        Returns:
	            True when the file was written; False when any step raised (the
	            error is logged). Individual WAV failures do not fail the
	            conversion.
	        """
	        try:
	            voicebank_path = Path(voicebank_path)
	            logger.info(f"보이스뱅크 변환 시작: {voicebank_path}")

	            # Load the existing voicebank through VoicebankManager.
	            vb_manager = VoicebankManager(voicebank_path)

	            # The target directory may not exist yet.
	            self.output_path.parent.mkdir(parents=True, exist_ok=True)

	            with h5py.File(self.output_path, 'w') as h5file:
	                meta_group = h5file.create_group('metadata')
	                self._store_oto_metadata(meta_group, vb_manager, voicebank_path.name)

	                audio_group = h5file.create_group('audio_data')
	                total_original_size = self._store_audio_files(audio_group, vb_manager.wav_files)

	            # Measure only after the file is closed: while it is open for
	            # writing, buffered data may not have reached the disk yet.
	            compressed_size = self.output_path.stat().st_size
	            compression_ratio = self._compression_ratio(total_original_size, compressed_size)

	            # Reopen to record the statistics. The size above is taken before
	            # these attributes are added, so it can be off by a few bytes.
	            with h5py.File(self.output_path, 'r+') as h5file:
	                meta_attrs = h5file['metadata'].attrs
	                meta_attrs['original_size_bytes'] = total_original_size
	                meta_attrs['compressed_size_bytes'] = compressed_size
	                meta_attrs['compression_ratio_percent'] = compression_ratio

	            logger.info("변환 완료!")
	            logger.info(f"원본 크기: {total_original_size / (1024*1024):.1f} MB")
	            logger.info(f"압축 크기: {compressed_size / (1024*1024):.1f} MB")
	            logger.info(f"압축율: {compression_ratio:.1f}%")

	            return True

	        except Exception as e:
	            logger.error(f"HDF5 변환 실패: {e}")
	            return False

	class CompressedVoicebankManager:
	    """Read access to a voicebank stored as a compressed HDF5 file.

	    The oto entries are loaded once at construction. Audio is read from the
	    file on demand and kept in a small least-recently-used cache.
	    """

	    def __init__(self, hdf5_path: Union[str, Path]):
	        """Open the voicebank at ``hdf5_path`` and load its oto metadata.

	        Raises:
	            FileNotFoundError: If ``hdf5_path`` does not exist.
	            Exception: Whatever :meth:`load_metadata` re-raises.
	        """
	        self.hdf5_path = Path(hdf5_path)
	        self.oto_entries: Dict[str, "OtoEntry"] = {}
	        # Insertion order of this dict doubles as the recency order.
	        self._audio_cache: Dict[str, Tuple[np.ndarray, int]] = {}
	        self.cache_size_limit = 50  # maximum number of cached audio files

	        if not self.hdf5_path.exists():
	            raise FileNotFoundError(f"압축된 보이스뱅크를 찾을 수 없습니다: {hdf5_path}")

	        self.load_metadata()

	    def load_metadata(self):
	        """Load the oto entries from ``metadata/oto_data`` into ``oto_entries``.

	        Raises:
	            Exception: Any read / decode error is logged and re-raised.
	        """
	        try:
	            with h5py.File(self.hdf5_path, 'r') as h5file:
	                # The dataset holds gzip-compressed UTF-8 JSON as uint8 bytes.
	                oto_compressed = h5file['metadata']['oto_data'][:]
	                oto_json = gzip.decompress(oto_compressed.tobytes()).decode('utf-8')
	                oto_data = json.loads(oto_json)

	                for alias, data in oto_data.items():
	                    self.oto_entries[alias] = OtoEntry(
	                        filename=data['filename'],
	                        alias=data['alias'],
	                        offset=data['offset'],
	                        consonant=data['consonant'],
	                        cutoff=data['cutoff'],
	                        preutterance=data['preutterance'],
	                        overlap=data['overlap'],
	                    )

	                meta = h5file['metadata']
	                logger.info(f"압축된 보이스뱅크 로드: {meta.attrs['voicebank_name']}")
	                logger.info(f"총 {meta.attrs['total_entries']}개 엔트리")
	                # The ratio is written last by the converter and can be
	                # missing; a log line must not make the voicebank unusable.
	                ratio = meta.attrs.get('compression_ratio_percent')
	                if ratio is not None:
	                    logger.info(f"압축율: {ratio:.1f}%")

	        except Exception as e:
	            logger.error(f"메타데이터 로드 실패: {e}")
	            raise

	    def get_audio_data(self, filename: str) -> Optional[Tuple[np.ndarray, int]]:
	        """Return ``(samples, sample_rate)`` for ``filename``, using the cache.

	        Args:
	            filename: WAV filename; ``'.wav'`` is removed to form the key
	                under ``audio_data``.

	        Returns:
	            The cached or freshly read tuple, or None when the file is not in
	            the voicebank or reading fails (the failure is logged).
	        """
	        base_filename = filename.replace('.wav', '')

	        if base_filename in self._audio_cache:
	            # Re-insert on a hit so the entry becomes the most recently used.
	            cached = self._audio_cache.pop(base_filename)
	            self._audio_cache[base_filename] = cached
	            return cached

	        try:
	            with h5py.File(self.hdf5_path, 'r') as h5file:
	                if base_filename not in h5file['audio_data']:
	                    return None

	                file_group = h5file['audio_data'][base_filename]
	                audio_data = file_group['audio'][:]
	                sample_rate = file_group.attrs['sample_rate']
	        except Exception as e:
	            logger.error(f"오디오 데이터 로드 실패 {filename}: {e}")
	            return None

	        result = (audio_data, int(sample_rate))

	        # A non-positive limit disables caching instead of failing.
	        if self.cache_size_limit > 0:
	            # Evict from the front: the least recently used entries.
	            while len(self._audio_cache) >= self.cache_size_limit:
	                del self._audio_cache[next(iter(self._audio_cache))]
	            self._audio_cache[base_filename] = result

	        return result

	    def get_sample_for_phoneme(self, phoneme: str) -> Optional["OtoEntry"]:
	        """Find the oto entry for ``phoneme``.

	        An exact alias match wins. Otherwise entries whose ``clean_alias``
	        equals ``phoneme`` are considered, preferring the first one whose
	        ``is_breath`` is false. Returns None when nothing matches.
	        """
	        exact = self.oto_entries.get(phoneme)
	        if phoneme in self.oto_entries:
	            return exact

	        candidates = [
	            entry for entry in self.oto_entries.values()
	            if entry.clean_alias == phoneme
	        ]
	        if not candidates:
	            return None

	        # Prefer a non-breath sample; fall back to the first candidate.
	        for entry in candidates:
	            if not entry.is_breath:
	                return entry
	        return candidates[0]

	    def list_available_phonemes(self) -> List[str]:
	        """Return the distinct ``clean_alias`` values, in no particular order."""
	        return list({entry.clean_alias for entry in self.oto_entries.values()})

	    def get_compression_info(self) -> Dict[str, object]:
	        """Return name, entry count and size statistics read from ``metadata``.

	        Returns:
	            A dict with ``voicebank_name``, ``total_entries``,
	            ``original_size_mb``, ``compressed_size_mb``,
	            ``compression_ratio`` and ``file_path``; an empty dict when the
	            file or any attribute cannot be read (the error is logged).
	        """
	        try:
	            with h5py.File(self.hdf5_path, 'r') as h5file:
	                attrs = h5file['metadata'].attrs
	                return {
	                    'voicebank_name': attrs['voicebank_name'],
	                    'total_entries': attrs['total_entries'],
	                    'original_size_mb': attrs['original_size_bytes'] / (1024*1024),
	                    'compressed_size_mb': attrs['compressed_size_bytes'] / (1024*1024),
	                    'compression_ratio': attrs['compression_ratio_percent'],
	                    'file_path': str(self.hdf5_path),
	                }
	        except Exception as e:
	            logger.error(f"압축 정보 로드 실패: {e}")
	            return {}

	def convert_voicebank_to_compressed_format(voicebank_path: str, output_path: str = None) -> bool:
	    """Convert a voicebank to the compressed HDF5 format in one call.

	    Args:
	        voicebank_path: Directory of the voicebank to convert.
	        output_path: Destination file. When None, the file goes to
	            ``voice/<name>_compressed.h5``, where ``<name>`` is the last
	            component of ``voicebank_path`` with spaces turned into
	            underscores.

	    Returns:
	        The result of ``VoiceDataCompressor.convert_voicebank_to_hdf5``.
	    """
	    target = output_path
	    if target is None:
	        safe_name = Path(voicebank_path).name.replace(' ', '_')
	        target = f"voice/{safe_name}_compressed.h5"

	    return VoiceDataCompressor(target).convert_voicebank_to_hdf5(voicebank_path)

	if __name__ == "__main__":
	    # Manual smoke test: convert the sample voicebank, then read it back.
	    if not convert_voicebank_to_compressed_format("voice/hanseol CVC"):
	        print("❌ 보이스뱅크 압축 실패!")
	    else:
	        print("✅ 보이스뱅크 압축 변환 완료!")

	        # Open the file that was just written and report on it.
	        compressed_vb = CompressedVoicebankManager("voice/hanseol_CVC_compressed.h5")
	        info = compressed_vb.get_compression_info()
	        print(f"📊 압축 정보: {info}")
	        phoneme_count = len(compressed_vb.list_available_phonemes())
	        print(f"🎤 사용 가능한 음소: {phoneme_count}개")