Spaces:

crlotwhite
/

UTAU-WebUI

Running

App Files Files Community

crlotwhite committed on Jun 11, 2025

Commit

1056960

1 Parent(s): 35c6482

Add UTAU WebUI project with LFS support for voice files

Browse files

Files changed (17) hide show

.gitignore +11 -0
.python-version +1 -0
COMPRESSION_REPORT.md +1 -0
Makefile +168 -0
README.md +211 -1
app.py +650 -0
compressed_utau_engine.py +208 -0
pyproject.toml +32 -0
requirements.txt +809 -0
straycat.py +825 -0
test_compressed_voicebank.py +158 -0
utau_engine.py +467 -0
uv.lock +0 -0
voice/hanseol_CVC_compressed.h5 +3 -0
voice/test_voice.sc.npz +3 -0
voice/test_voice.wav +0 -0
voice_data_converter.py +264 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,11 @@

+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv
+.DS_Store

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

COMPRESSION_REPORT.md ADDED Viewed

	@@ -0,0 +1 @@


1	+

Makefile ADDED Viewed

	@@ -0,0 +1,168 @@

+# Every target in this Makefile is a command, not a file. Declare all of them
+# phony so a same-named file (e.g. "all", "info", "shell") can never shadow them.
+.PHONY: help setup install check-deps compress run dev test status clean clean-all update shell info all
+# 기본 설정
+PYTHON := uv run python
+UV := uv
+VOICEBANK_DIR := voice/hanseol CVC
+COMPRESSED_FILE := voice/hanseol_CVC_compressed.h5
+PORT := 7860
+# 기본 타겟
+help: ## 도움말 출력
+	@echo "🎵 UTAU WebUI - 개발 환경 자동화 도구"
+	@echo ""
+	@echo "📋 사용 가능한 명령어:"
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "  \033[36m%-15s\033[0m %s\n", $$1, $$2}'
+	@echo ""
+	@echo "🚀 빠른 시작:"
+	@echo "  1. make setup     # 개발 환경 설정"
+	@echo "  2. make compress  # 보이스뱅크 압축"
+	@echo "  3. make run       # 웹UI 실행"
+	@echo ""
+setup: ## 개발 환경 초기 설정
+	@echo "🔧 개발 환경을 설정합니다..."
+	@if ! command -v uv >/dev/null 2>&1; then \
+		echo "❌ uv가 설치되지 않았습니다. https://docs.astral.sh/uv/ 에서 설치하세요."; \
+		exit 1; \
+	fi
+	@echo "📦 의존성을 설치합니다..."
+	$(UV) sync
+	@echo "📁 필요한 디렉토리를 생성합니다..."
+	@mkdir -p voice
+	@echo "✅ 개발 환경 설정 완료!"
+install: setup ## setup의 별칭
+check-deps: ## 의존성 및 환경 확인
+	@echo "🔍 환경을 확인합니다..."
+	@echo "UV 버전: $$($(UV) --version 2>/dev/null || echo '❌ uv 없음')"
+	@echo "Python 버전: $$($(PYTHON) --version 2>/dev/null || echo '❌ Python 없음')"
+	@if [ -f "$(COMPRESSED_FILE)" ]; then \
+		echo "✅ 압축된 보이스뱅크: $(COMPRESSED_FILE)"; \
+		$(PYTHON) -c "import h5py; f=h5py.File('$(COMPRESSED_FILE)', 'r'); print(f'📊 메타데이터: {dict(f[\"metadata\"].attrs)}')"; \
+	else \
+		echo "❌ 압축된 보이스뱅크가 없음: $(COMPRESSED_FILE)"; \
+	fi
+	@if [ -d "$(VOICEBANK_DIR)" ]; then \
+		echo "✅ 원본 보이스뱅크: $(VOICEBANK_DIR) ($$(find "$(VOICEBANK_DIR)" -name "*.wav" | wc -l)개 WAV 파일)"; \
+	else \
+		echo "❌ 원본 보이스뱅크가 없음: $(VOICEBANK_DIR)"; \
+	fi
+# Convert the raw voicebank directory into a single compressed HDF5 file.
+# The inline Python exits non-zero when the converter reports failure, so make
+# stops here instead of printing the final success banner after a failed run.
+compress: ## 보이스뱅크를 HDF5 형태로 압축
+	@echo "🗜️  보이스뱅크를 압축합니다..."
+	@if [ ! -d "$(VOICEBANK_DIR)" ]; then \
+		echo "❌ 원본 보이스뱅크를 찾을 수 없습니다: $(VOICEBANK_DIR)"; \
+		echo "📋 해결 방법:"; \
+		echo "  1. hanseol CVC 보이스뱅크를 $(VOICEBANK_DIR) 에 배치"; \
+		echo "  2. 또는 다른 보이스뱅크를 사용하려면:"; \
+		echo "     make compress VOICEBANK_DIR=your/voicebank/path"; \
+		exit 1; \
+	fi
+	@echo "📁 원본 위치: $(VOICEBANK_DIR)"
+	@echo "💾 압축 파일: $(COMPRESSED_FILE)"
+	$(PYTHON) -c "\
+import sys; \
+from voice_data_converter import convert_voicebank_to_compressed_format; \
+success = convert_voicebank_to_compressed_format('$(VOICEBANK_DIR)', '$(COMPRESSED_FILE)'); \
+print('✅ 압축 완료!' if success else '❌ 압축 실패'); \
+sys.exit(0 if success else 1)"
+	@echo "🎉 보이스뱅크 압축이 완료되었습니다!"
+# Start the web UI. Refuses to start without the compressed voicebank.
+# The entry point is app.py: this commit adds no webui.py.
+run: ## 웹UI 실행
+	@echo "🚀 UTAU WebUI를 시작합니다..."
+	@if [ ! -f "$(COMPRESSED_FILE)" ]; then \
+		echo "❌ 압축된 보이스뱅크가 없습니다."; \
+		echo "📋 먼저 다음 명령어를 실행하세요: make compress"; \
+		exit 1; \
+	fi
+	@echo "🌐 웹 브라우저에서 http://localhost:$(PORT) 을 열어주세요"
+	$(PYTHON) app.py
+# Start the web UI in development mode with GRADIO_AUTO_RELOAD=1 set.
+# The variable is passed as a plain shell assignment in front of the command;
+# `uv run` inherits the calling environment (it has --env-file, but no --env).
+# NOTE(review): confirm that app.py/Gradio actually honours GRADIO_AUTO_RELOAD.
+dev: ## 개발 모드로 실행 (auto-reload)
+	@echo "🔧 개발 모드로 UTAU WebUI를 시작합니다..."
+	@if [ ! -f "$(COMPRESSED_FILE)" ]; then \
+		echo "❌ 압축된 보이스뱅크가 없습니다."; \
+		echo "📋 먼저 다음 명령어를 실행하세요: make compress"; \
+		exit 1; \
+	fi
+	@echo "🌐 웹 브라우저에서 http://localhost:$(PORT) 을 열어주세요"
+	@echo "🔄 파일 변경 시 자동으로 재시작됩니다"
+	GRADIO_AUTO_RELOAD=1 $(UV) run python app.py
+test: ## 압축된 보이스뱅크 테스트
+	@echo "🧪 압축된 보이스뱅크를 테스트합니다..."
+	@if [ ! -f "$(COMPRESSED_FILE)" ]; then \
+		echo "❌ 압축된 보이스뱅크가 없습니다: $(COMPRESSED_FILE)"; \
+		echo "📋 먼저 다음 명령어를 실행하세요: make compress"; \
+		exit 1; \
+	fi
+	$(PYTHON) test_compressed_voicebank.py
+# Print the environment report (check-deps) followed by a one-line summary of
+# what the user should do next, based on which voicebank artefacts exist.
+status: ## 현재 상태 확인
+	@echo "📊 UTAU WebUI 상태"
+	@echo "===================="
+# Recurse through $(MAKE), not a literal `make`, so the same make binary and
+# command-line flags/variable overrides (e.g. VOICEBANK_DIR=...) are propagated.
+	@$(MAKE) check-deps
+	@echo ""
+	@if [ -f "$(COMPRESSED_FILE)" ] && [ -d "$(VOICEBANK_DIR)" ]; then \
+		echo "🎉 모든 준비가 완료되었습니다! 'make run'으로 시작하세요."; \
+	elif [ -f "$(COMPRESSED_FILE)" ]; then \
+		echo "✅ 압축된 보이스뱅크가 준비되었습니다! 'make run'으로 시작하세요."; \
+	elif [ -d "$(VOICEBANK_DIR)" ]; then \
+		echo "⚠️  보이스뱅크가 있지만 압축되지 않았습니다. 'make compress'를 실행하세요."; \
+	else \
+		echo "❌ 보이스뱅크가 없습니다. 먼저 보이스뱅크를 준비하고 'make compress'를 실행하세요."; \
+	fi
+clean: ## 임시 파일 및 캐시 정리
+	@echo "🧹 임시 파일을 정리합니다..."
+	@find . -type f -name "*.pyc" -delete 2>/dev/null || true
+	@find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
+	@find . -type f -name "*.tmp" -delete 2>/dev/null || true
+	@find . -type f -name ".DS_Store" -delete 2>/dev/null || true
+	@rm -rf .pytest_cache 2>/dev/null || true
+	@echo "✅ 정리 완료!"
+# Run `clean`, then delete the compressed voicebank after an interactive y/N
+# confirmation. Anything other than y/Y (including an unreadable stdin) cancels.
+# The prompt uses printf + `read -r` because `read -p` is a bash extension and
+# make runs recipes with /bin/sh by default.
+clean-all: clean ## 모든 생성된 파일 삭제 (압축 파일 포함)
+	@echo "🗑️  모든 생성된 파일을 삭제합니다..."
+	@if [ -f "$(COMPRESSED_FILE)" ]; then \
+		echo "⚠️  압축된 보이스뱅크도 삭제됩니다: $(COMPRESSED_FILE)"; \
+		printf '%s' "계속하시겠습니까? (y/N): "; \
+		read -r confirm; \
+		if [ "$$confirm" = "y" ] || [ "$$confirm" = "Y" ]; then \
+			rm -f "$(COMPRESSED_FILE)"; \
+			echo "✅ 모든 파일이 삭제되었습니다."; \
+		else \
+			echo "❌ 취소되었습니다."; \
+		fi \
+	else \
+		echo "✅ 정리할 파일이 없습니다."; \
+	fi
+# 개발자를 위한 추가 명령어
+update: ## 의존성 업데이트
+	@echo "📦 의존성을 업데이트합니다..."
+	$(UV) sync --upgrade
+	@echo "✅ 업데이트 완료!"
+# Open an interactive shell inside the project environment.
+# NOTE(review): uv does not appear to provide a `shell` subcommand, so this runs
+# the user's login shell (falling back to /bin/sh) through `uv run`, which
+# executes a command in the project environment. Confirm against the uv version in use.
+shell: ## 프로젝트 쉘 진입
+	@echo "🐚 프로젝트 쉘에 진입합니다..."
+	$(UV) run $${SHELL:-/bin/sh}
+info: ## 프로젝트 정보 출력
+	@echo "📋 UTAU WebUI 프로젝트 정보"
+	@echo "============================"
+	@echo "프로젝트: UTAU WebUI"
+	@echo "설명: 한국어 CVC 보이스뱅크를 사용한 웹 기반 UTAU 음성 합성기"
+	@echo "기술 스택: Python, Gradio, HDF5, UV"
+	@echo "포트: $(PORT)"
+	@echo "보이스뱅크: $(VOICEBANK_DIR)"
+	@echo "압축 파일: $(COMPRESSED_FILE)"
+	@echo ""
+	@echo "📁 디렉토리 구조:"
+	@find . -maxdepth 2 -type f -name "*.py" | head -10
+	@echo ""
+# 전체 워크플로우
+all: setup compress run ## 전체 설정 및 실행 (setup → compress → run)
+# 기본 타겟을 help로 설정
+.DEFAULT_GOAL := help

README.md CHANGED Viewed

@@ -9,5 +9,215 @@ app_file: app.py
 pinned: false
 license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 license: mit
 ---
+# 🎵 UTAU WebUI - 한국어 음성 합성기
+피아노롤 기반의 웹 UTAU 음성 합성 시스템입니다. 한국어 CVC 보이스뱅크를 사용하여 자연스러운 한국어 음성을 합성할 수 있습니다.
+## ✨ 특징
+- 🎹 **직관적인 피아노롤 인터페이스**: 웹 브라우저에서 바로 사용 가능한 피아노롤 편집기
+- 🇰🇷 **한국어 음성 합성**: hanseol CVC 보이스뱅크를 사용한 고품질 한국어 음성 합성
+- 🗜️ **HDF5 압축 시스템**: 47.7MB → 33.9MB (29% 압축), 46개 파일 → 1개 파일로 최적화
+- 🚀 **자동화된 워크플로우**: Makefile을 통한 원클릭 설정, 압축, 실행
+- 🎵 **실시간 편집**: 노트 추가, 삭제, 가사 입력이 실시간으로 가능
+- 🔊 **웨이브폼 시각화**: 합성된 음성의 웨이브폼을 피아노롤에서 바로 확인
+- 🎤 **CVC 음소 시스템**: 585개의 한국어 CVC 음소로 자연스러운 발음 구현
+- ☁️ **클라우드 최적화**: Gradio 및 Hugging Face Spaces 환경에 최적화
+## 🎤 보이스뱅크 정보
+- **보이스뱅크**: hanseol CVC (HDF5 압축)
+- **CV (Character Voice)**: KUNGOM
+- **UTAU**: KITANE 백한설
+- **음소 수**: 585개 CVC 음소
+- **언어**: 한국어
+- **압축율**: 29% (원본 47.7MB → 압축 33.9MB)
+## 🚀 빠른 시작
+### 필요 조건
+- Python 3.12+
+- [uv](https://docs.astral.sh/uv/) (Python 패키지 관리자)
+- GNU Make (자동화 스크립트용)
+### uv 설치
+```bash
+# macOS/Linux
+curl -LsSf https://astral.sh/uv/install.sh | sh
+# Windows
+powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
+```
+### 🎯 한 번에 설정하고 실행하기
+```bash
+# 저장소 클론
+git clone <repository-url>
+cd utau-webui
+# 모든 설정과 실행을 한 번에
+make all
+```
+### 📋 단계별 실행
+```bash
+# 1. 개발 환경 설정
+make setup
+# 2. 보이스뱅크 압축 (최초 1회만)
+make compress
+# 3. 웹UI 실행
+make run
+```
+### 🔍 현재 상태 확인
+```bash
+# 프로젝트 상태 확인
+make status
+# 의존성 및 환경 확인
+make check-deps
+```
+## 📋 Makefile 명령어
+| 명령어 | 설명 |
+|--------|------|
+| `make help` | 사용 가능한 모든 명령어 표시 |
+| `make setup` | 개발 환경 초기 설정 (의존성 설치) |
+| `make compress` | 보이스뱅크를 HDF5 형태로 압축 |
+| `make run` | 웹UI 실행 |
+| `make dev` | 개발 모드로 실행 (auto-reload) |
+| `make test` | 압축된 보이스뱅크 테스트 |
+| `make status` | 현재 프로젝트 상태 확인 |
+| `make clean` | 임시 파일 및 캐시 정리 |
+| `make all` | 전체 설정 및 실행 (setup → compress → run) |
+## 🎼 사용법
+1. **노트 추가**: 피아노롤에서 원하는 위치를 클릭하여 노트 추가
+2. **가사 입력**: 노트를 더블클릭하여 한국어 가사 입력
+3. **노트 편집**: 드래그하여 노트 길이 조정, 위아래로 드래그하여 음높이 조정
+4. **음성 합성**: "🎵 음성 합성" 버튼 클릭하여 음성 생성
+5. **재생**: 생성된 음성을 바로 들어보거나 다운로드
+### 지원하는 한국어 음소
+- **기본 모음**: 아, 이, 우, 에, 오, 으, 어
+- **자음+모음 조합**: 바, 다, 가, 하, 자, 카, 라, 마, 나, 파, 사, 타 등
+- **복합 모음**: 야, 예, 여, 요, 유, 의, 와, 웨, 위, 워
+- **도레미 음계**: 도, 레, 미, 파, 솔, 라, 시
+## 🗜️ HDF5 압축 시스템
+### 장점
+- **파일 관리 최적화**: 46개 WAV 파일 → 1개 HDF5 파일
+- **용량 최적화**: 29% 압축 효율 (47.7MB → 33.9MB)
+- **성능 향상**: 더 빠른 로딩 및 배포
+- **클라우드 친화적**: Hugging Face Spaces 등 클라우드 환경에 최적화
+### 압축 과정
+```bash
+# 자동 압축
+make compress
+# 수동 압축
+uv run python -c "from voice_data_converter import convert_voicebank_to_compressed_format; convert_voicebank_to_compressed_format('voice/hanseol CVC')"
+```
+## 🛠️ 기술 스택
+- **Frontend**: Gradio + Custom PianoRoll Component
+- **Backend**: Python
+- **음성 합성**: UTAU Engine + Straycat Resampler
+- **데이터 압축**: HDF5 (with gzip compression)
+- **오디오 처리**: SoundFile, NumPy
+- **패키지 관리**: uv
+- **자동화**: GNU Make
+## 🔧 개발하기
+### 개발 환경 설정
+```bash
+# 전체 개발 환경 설정
+make setup
+# 개발 모드로 실행 (파일 변경 시 자동 재시작)
+make dev
+# 쉘 진입
+make shell
+```
+### 의존성 관리
+```bash
+# 의존성 추가
+uv add <package-name>
+# 개발 의존성 추가
+uv add --dev <package-name>
+# 의존성 업데이트
+make update
+```
+### 프로젝트 정보
+```bash
+# 프로젝트 정보 확인
+make info
+# 의존성 트리 확인
+uv tree
+```
+## 🚨 문제 해결
+### 압축된 보이스뱅크가 없는 경우
+```bash
+# 상태 확인
+make status
+# 보이스뱅크 압축
+make compress
+```
+### 원본 보이스뱅크가 없는 경우
+1. hanseol CVC 보이스뱅크를 `voice/hanseol CVC` 디렉토리에 배치
+2. `make compress` 실행
+### 환경 문제
+```bash
+# 환경 확인
+make check-deps
+# 의존성 재설치
+make setup
+```
+## 📝 저작권 및 라이선스
+### UTAU WebUI
+본 프로젝트는 오픈소스 소프트웨어입니다.
+### Straycat Resampler
+본 프로젝트에서 사용하는 UTAU 리샘플러는 [straycat](https://github.com/UtaUtaUtau/straycat)을 기반으로 합니다.
+**원본 저장소**: https://github.com/UtaUtaUtau/straycat
+**라이선스**: MIT License
+**저작권**: Copyright (c) UtaUtaUtau
+> Yet another WORLD-based UTAU resampler.
+MIT 라이선스에 따라 본 프로젝트에 포함되었으며, 원본 저작권 표시를 유지합니다.
+### hanseol CVC 보이스뱅크
+- **CV**: KUNGOM
+- **UTAU**: KITANE 백한설
+해당 보이스뱅크의 사용 권한에 대해서는 원 저작권자의 이용 약관을 따릅니다.

app.py ADDED Viewed

	@@ -0,0 +1,650 @@

+import gradio as gr
+import gradio_pianoroll as grp
+import tempfile
+import os
+import numpy as np
+import soundfile as sf
+from pathlib import Path
+from straycat import Resampler
+import logging
+import json
+import base64
+import io
+import wave
+from compressed_utau_engine import CompressedUTAUEngine
+import os
+# 로깅 설정
+logging.basicConfig(level=logging.INFO)
+# 압축된 보이스뱅크만 사용
+utau_engine = None
+USE_UTAU = False
+# 압축된 hanseol CVC 보이스뱅크 경로
+compressed_hanseol_path = "voice/hanseol_CVC_compressed.h5"
+logger = logging.getLogger(__name__)
+# 압축된 보이스뱅크 로드 (필수)
+if os.path.exists(compressed_hanseol_path):
+    try:
+        utau_engine = CompressedUTAUEngine(compressed_hanseol_path)
+        USE_UTAU = True
+        available_phonemes = utau_engine.get_available_phonemes()
+        compression_info = utau_engine.get_compression_info()
+        logger.info(f"✅ 압축된 hanseol CVC 보이스뱅크 로드 완료: {len(available_phonemes)}개 음소")
+        logger.info(f"📊 압축율: {compression_info.get('compression_ratio', 0):.1f}%")
+        logger.info(f"💾 압축된 파일 크기: {compression_info.get('compressed_size_bytes', 0) / (1024*1024):.1f} MB")
+    except Exception as e:
+        logger.error(f"❌ 압축된 보이스뱅크 로드 실패: {e}")
+        print(f"\n{'='*60}")
+        print("🚨 압축된 보이스뱅크 로드 실패!")
+        print(f"파일 경로: {compressed_hanseol_path}")
+        print(f"오류: {e}")
+        print("\n📋 해결 방법:")
+        print("1. 다음 명령어로 보이스뱅크를 압축하세요:")
+        print("   make compress")
+        print("2. 또는 수동으로 실행:")
+        print("   uv run python voice_data_converter.py")
+        print(f"{'='*60}\n")
+        USE_UTAU = False
+else:
+    logger.error(f"❌ 압축된 보이스뱅크 파일을 찾을 수 없음: {compressed_hanseol_path}")
+    print(f"\n{'='*60}")
+    print("🚨 압축된 보이스뱅크 파일이 없습니다!")
+    print(f"예상 위치: {compressed_hanseol_path}")
+    print("\n📋 해결 방법:")
+    print("1. 원본 보이스뱅크가 있다면 압축하세요:")
+    print("   make compress")
+    print("2. 또는 수동으로 실행:")
+    print("   uv run python -c \"from voice_data_converter import convert_voicebank_to_compressed_format; convert_voicebank_to_compressed_format('voice/hanseol CVC')\"")
+    print("\n3. 보이스뱅크 다운로드가 필요한 경우:")
+    print("   - hanseol CVC 보이스뱅크를 voice/ 디렉토리에 배치")
+    print("   - 그 후 위의 압축 명령어 실행")
+    print(f"{'='*60}\n")
+    USE_UTAU = False
+# 압축된 보이스뱅크가 없으면 경고 메시지와 함께 제한된 기능만 제공
+if not USE_UTAU:
+    available_phonemes = []
+    logger.warning("⚠️ 압축된 보이스뱅크 없이 제한된 모드로 실행됩니다.")
+    logger.warning("⚠️ 음성 합성 기능을 사용하려면 먼저 보이스뱅크를 압축하세요.")
+def audio_to_base64_wav(audio_data, sample_rate):
+    """Convert audio data to base64 encoded WAV string"""
+    if audio_data is None or len(audio_data) == 0:
+        return None
+    # Normalize audio data to [-1, 1] range
+    if np.max(np.abs(audio_data)) > 0:
+        audio_data = audio_data / np.max(np.abs(audio_data))
+    # Convert to 16-bit PCM
+    audio_16bit = (audio_data * 32767).astype(np.int16)
+    # Create WAV file in memory
+    buffer = io.BytesIO()
+    with wave.open(buffer, 'wb') as wav_file:
+        wav_file.setnchannels(1)  # Mono
+        wav_file.setsampwidth(2)  # 16-bit
+        wav_file.setframerate(sample_rate)
+        wav_file.writeframes(audio_16bit.tobytes())
+    # base64 encoding
+    buffer.seek(0)
+    wav_data = buffer.read()
+    base64_data = base64.b64encode(wav_data).decode('utf-8')
+    return f"data:audio/wav;base64,{base64_data}"
+def calculate_waveform_data(audio_data, pixels_per_beat, tempo, target_width=1000):
+    """Calculate waveform visualization data from audio data"""
+    if audio_data is None or len(audio_data) == 0:
+        return None
+    sample_rate = 44100
+    # Calculate total audio duration (seconds)
+    audio_duration = len(audio_data) / sample_rate
+    # Calculate total pixel length (based on tempo and pixels per beat)
+    total_pixels = (tempo / 60) * pixels_per_beat * audio_duration
+    # Calculate samples per pixel
+    samples_per_pixel = len(audio_data) / total_pixels
+    waveform_points = []
+    # Calculate min/max values for each pixel
+    for pixel in range(int(total_pixels)):
+        start_sample = int(pixel * samples_per_pixel)
+        end_sample = int((pixel + 1) * samples_per_pixel)
+        end_sample = min(end_sample, len(audio_data))
+        if start_sample >= len(audio_data):
+            break
+        if start_sample < end_sample:
+            # Audio data for the pixel range
+            pixel_data = audio_data[start_sample:end_sample]
+            # Calculate min, max values
+            min_val = float(np.min(pixel_data))
+            max_val = float(np.max(pixel_data))
+            # Time information (pixel position)
+            time_position = pixel
+            waveform_points.append({
+                'x': time_position,
+                'min': min_val,
+                'max': max_val
+            })
+    return waveform_points
+def add_waveform_to_pianoroll(pianoroll_data, audio_data, sample_rate, tempo):
+    """Add waveform data to pianoroll for visualization - demo/app.py와 동일한 방식"""
+    # demo/app.py와 동일한 방식으로 완전히 복사
+    updated_pianoroll = pianoroll_data.copy() if pianoroll_data else {}
+    # Add backend audio data
+    audio_base64 = audio_to_base64_wav(audio_data, sample_rate)
+    updated_pianoroll['audio_data'] = audio_base64
+    updated_pianoroll['use_backend_audio'] = True
+    # Get tempo and pixels per beat from pianoroll data
+    pixels_per_beat = updated_pianoroll.get('pixelsPerBeat', 80)
+    # Calculate waveform data
+    waveform_data = calculate_waveform_data(audio_data, pixels_per_beat, tempo)
+    # demo/app.py와 동일한 curve_data 처리 방식
+    curve_data = {}
+    # Add waveform data to curve_data
+    if waveform_data:
+        curve_data['waveform_data'] = waveform_data
+        print(f"Waveform data created: {len(waveform_data)} points")
+    # Set curve data for piano roll (demo/app.py와 동일)
+    if curve_data:
+        updated_pianoroll['curve_data'] = curve_data
+    # demo/app.py와 같은 방식으로 segment_data 추가
+    if 'notes' in updated_pianoroll and updated_pianoroll['notes']:
+        segment_data = []
+        for i, note in enumerate(updated_pianoroll['notes']):
+            start_seconds = note.get('startSeconds', 0)
+            duration_seconds = note.get('durationSeconds', 0.5)
+            segment_data.append({
+                'start': start_seconds,
+                'end': start_seconds + duration_seconds,
+                'type': 'note',
+                'value': note.get('lyric', f"Note_{i+1}"),
+                'confidence': 0.95
+            })
+        updated_pianoroll['segment_data'] = segment_data
+    # 상세한 디버깅 로그 (demo/app.py와 동일한 형식)
+    print(f"🔊 [add_waveform_to_pianoroll] Setting backend audio data:")
+    print(f"   - audio_data length: {len(audio_base64) if audio_base64 else 0}")
+    print(f"   - use_backend_audio: {updated_pianoroll['use_backend_audio']}")
+    print(f"   - waveform points: {len(waveform_data) if waveform_data else 0}")
+    print(f"   - Updated pianoroll keys: {list(updated_pianoroll.keys())}")
+    return updated_pianoroll
+def create_test_voice_sample():
+    """테스트용 간단한 음성 샘플 생성"""
+    voice_dir = Path("voice")
+    voice_dir.mkdir(exist_ok=True)
+    sample_path = voice_dir / "test_voice.wav"
+    if not sample_path.exists():
+        # 간단한 사인파 기반 음성 샘플 생성 (A4 = 440Hz)
+        duration = 1.0  # 1초
+        sample_rate = 44100
+        t = np.linspace(0, duration, int(sample_rate * duration), False)
+        # 기본 주파수 (A4)
+        fundamental = 440.0
+        # 하모닉을 추가한 더 자연스러운 소리
+        signal = (np.sin(2 * np.pi * fundamental * t) * 0.5 +
+                 np.sin(2 * np.pi * fundamental * 2 * t) * 0.2 +
+                 np.sin(2 * np.pi * fundamental * 3 * t) * 0.1 +
+                 np.sin(2 * np.pi * fundamental * 4 * t) * 0.05)
+        # ADSR 엔벨로프 적용
+        attack = int(0.05 * sample_rate)
+        decay = int(0.1 * sample_rate)
+        sustain_level = 0.7
+        release = int(0.2 * sample_rate)
+        sustain = len(signal) - attack - decay - release
+        envelope = np.ones_like(signal)
+        envelope[:attack] = np.linspace(0, 1, attack)
+        envelope[attack:attack+decay] = np.linspace(1, sustain_level, decay)
+        envelope[attack+decay:attack+decay+sustain] = sustain_level
+        envelope[-release:] = np.linspace(sustain_level, 0, release)
+        signal = signal * envelope
+        # 포먼트 필터 추가 (간단한 음성 특성)
+        from scipy import signal as scipy_signal
+        # 음성 특성을 모방한 간단한 필터
+        b, a = scipy_signal.butter(2, [300, 3000], btype='band', fs=sample_rate)
+        signal = scipy_signal.filtfilt(b, a, signal)
+        # 노이즈 추가로 더 자연스럽게
+        noise = np.random.normal(0, 0.02, len(signal))
+        signal = signal + noise
+        # 정규화
+        signal = signal / np.max(np.abs(signal)) * 0.8
+        sf.write(sample_path, signal, sample_rate)
+        logging.info(f"테스트 음성 샘플 생성: {sample_path}")
+    return sample_path
+def midi_to_note_name(midi_note):
+    """MIDI 노트 번호를 노트 이름으로 변환"""
+    notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
+    octave = (midi_note // 12) - 1
+    note = notes[midi_note % 12]
+    return f"{note}{octave}"
+def synthesize_notes(pianoroll_data, use_utau_engine):
+    """피아노롤 데이터를 받아서 음성을 합성하고 웨이브폼을 피아노롤에 업데이트"""
+    if not pianoroll_data or not pianoroll_data.get('notes'):
+        return pianoroll_data, None, "노트가 없습니다. 피아노롤에 노트를 추가하세요."
+    try:
+        notes = pianoroll_data['notes']
+        # 기본값 설정
+        velocity_setting = 100
+        volume_setting = 100
+        use_vibrato = False
+        vibrato_depth = 20
+        # 피아노롤에서 tempo 가져오기 (기본값: 120)
+        tempo = pianoroll_data.get('tempo', 120)
+        logging.info(f"합성할 노트 수: {len(notes)}, 템포: {tempo} BPM (피아노롤에서 가져옴)")
+        # 피아노롤 노트에서 가사 추출
+        lyrics = []
+        for note in notes:
+            lyric = note.get('lyric', '').strip()
+            if not lyric:
+                lyric = "あ"  # 가사가 없으면 기본 일본어 음소
+            lyrics.append(lyric)
+        logging.info(f"추출된 가사: {lyrics}")
+        # UTAU 엔진 사용 여부 결정
+        use_utau = use_utau_engine.startswith("UTAU 엔진")
+        if use_utau and USE_UTAU and utau_engine:
+            # UTAU 엔진으로 합성
+            audio_file, status = utau_engine.synthesize_sequence(
+                notes=notes,
+                lyrics=lyrics,
+                tempo=tempo,  # 피아노롤의 tempo 사용
+                volume=volume_setting
+            )
+            if audio_file:
+                # 합성된 오디오 로드
+                audio_data, sample_rate = sf.read(audio_file)
+                # 피아노롤에 웨이브폼 데이터 추가
+                updated_pianoroll = add_waveform_to_pianoroll(
+                    pianoroll_data, audio_data, sample_rate, tempo  # 피아노롤의 tempo 사용
+                )
+                return updated_pianoroll, audio_file, status
+            else:
+                return pianoroll_data, None, status
+        # 기본 엔진으로 합성 (기존 코드)
+        # 테스트 음성 샘플 준비
+        voice_sample = create_test_voice_sample()
+        # 전체 길이 계산 (가장 늦게 끝나는 노트 기준) - 초 단위로 계산
+        max_end_time_seconds = max(note.get('endSeconds', note.get('startSeconds', 0) + note.get('durationSeconds', 0.5)) for note in notes)
+        max_end_time = max_end_time_seconds * 1000  # 밀리초로 변환
+        sample_rate = 44100
+        total_samples = int(max_end_time * sample_rate / 1000) + sample_rate  # 여유분 추가
+        # 최종 오디오 버퍼
+        final_audio = np.zeros(total_samples)
+        # 각 노트를 개별적으로 합성하고 믹싱
+        for i, note in enumerate(notes):
+            try:
+                pitch = note['pitch']
+                start_ms = note.get('startSeconds', 0) * 1000  # 초를 밀리초로 변환
+                duration_ms = note.get('durationSeconds', 0.5) * 1000  # 초를 밀리초로 변환
+                velocity = note.get('velocity', velocity_setting)
+                note_name = midi_to_note_name(pitch)
+                start_seconds = note.get('startSeconds', 0)
+                logging.info(f"노트 {i+1}: {note_name} (MIDI {pitch}), 시작: {start_seconds}s ({start_ms}ms), 길이: {duration_ms}ms")
+                # 임시 출력 파일
+                with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
+                    temp_output = temp_file.name
+                # 플래그 설정
+                flags = ''
+                if use_vibrato:
+                    flags += f'A{int(vibrato_depth)}'
+                try:
+                    # straycat Resampler로 합성 (length를 노트 길이에 맞게 설정)
+                    resampler = Resampler(
+                        in_file=str(voice_sample),
+                        out_file=temp_output,
+                        pitch=note_name,
+                        velocity=velocity,
+                        length=max(duration_ms, 200),  # 최소 200ms 보장
+                        volume=volume_setting,
+                        flags=flags,
+                        offset=0,
+                        consonant=20,  # 약간의 자연스러운 어택
+                        cutoff=0,
+                        modulation=10,  # 약간의 모듈레이션
+                        tempo=f'!{int(tempo)}'  # 피아노롤의 tempo 사용
+                    )
+                    # 합성된 오디오 로드
+                    if os.path.exists(temp_output):
+                        synth_audio, _ = sf.read(temp_output)
+                        # 오디오를 올바른 위치에 배치
+                        start_sample = int(start_ms * sample_rate / 1000)
+                        end_sample = start_sample + len(synth_audio)
+                        if end_sample <= len(final_audio):
+                            final_audio[start_sample:end_sample] += synth_audio * (velocity / 100)
+                        else:
+                            # 버퍼가 부족하면 확장
+                            new_size = end_sample + sample_rate
+                            new_final_audio = np.zeros(new_size)
+                            new_final_audio[:len(final_audio)] = final_audio
+                            new_final_audio[start_sample:end_sample] += synth_audio * (velocity / 100)
+                            final_audio = new_final_audio
+                        logging.info(f"노트 {i+1} 합성 완료")
+                except Exception as e:
+                    logging.error(f"노트 {i+1} 합성 실패: {e}")
+                    continue
+                finally:
+                    # 임시 파일 정리
+                    if os.path.exists(temp_output):
+                        os.unlink(temp_output)
+            except Exception as e:
+                logging.error(f"노트 {i+1} 처리 실패: {e}")
+                continue
+        # 최종 오디오 정규화 및 마스터링
+        if np.max(np.abs(final_audio)) > 0:
+            # 컴프레서 효과 (간단한 버전)
+            threshold = 0.7
+            ratio = 4.0
+            # 피크 검출
+            abs_audio = np.abs(final_audio)
+            over_threshold = abs_audio > threshold
+            # 컴프레션 적용
+            compressed = final_audio.copy()
+            compressed[over_threshold] = (
+                np.sign(final_audio[over_threshold]) *
+                (threshold + (abs_audio[over_threshold] - threshold) / ratio)
+            )
+            final_audio = compressed / np.max(np.abs(compressed)) * 0.85
+        # 최종 파일 저장
+        output_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
+        sf.write(output_file.name, final_audio, sample_rate)
+        output_file.close()
+        # 피아노롤에 웨이브폼 데이터 추가
+        updated_pianoroll = add_waveform_to_pianoroll(
+            pianoroll_data, final_audio, sample_rate, tempo  # 피아노롤의 tempo 사용
+        )
+        duration_sec = len(final_audio) / sample_rate
+        logging.info(f"합성 완료: {len(notes)}개 노트, 총 길이: {duration_sec:.2f}초, 템포: {tempo} BPM")
+        return updated_pianoroll, output_file.name, f"✅ 합성 완료: {len(notes)}개 노트, {duration_sec:.1f}초, 템포: {tempo} BPM"
+    except Exception as e:
+        error_msg = f"❌ 합성 중 오류 발생: {str(e)}"
+        logging.error(error_msg)
+        return pianoroll_data, None, error_msg
+def create_example_melody():
+    """예제 멜로디 생성 - demo/app.py와 동일한 방식"""
+    # demo/app.py와 동일한 노트 구조 사용 (id 추가)
+    notes = [
+        {
+            "id": "note_0",
+            "start": 0,
+            "duration": 160,
+            "pitch": 60,  # C4
+            "velocity": 100,
+            "lyric": "도",
+            "startSeconds": 0.0,
+            "durationSeconds": 0.5,
+            "endSeconds": 0.5
+        },
+        {
+            "id": "note_1",
+            "start": 160,
+            "duration": 160,
+            "pitch": 62,  # D4
+            "velocity": 100,
+            "lyric": "레",
+            "startSeconds": 0.5,
+            "durationSeconds": 0.5,
+            "endSeconds": 1.0
+        },
+        {
+            "id": "note_2",
+            "start": 320,
+            "duration": 160,
+            "pitch": 64,  # E4
+            "velocity": 100,
+            "lyric": "미",
+            "startSeconds": 1.0,
+            "durationSeconds": 0.5,
+            "endSeconds": 1.5
+        },
+        {
+            "id": "note_3",
+            "start": 480,
+            "duration": 160,
+            "pitch": 67,  # G4
+            "velocity": 100,
+            "lyric": "솔",
+            "startSeconds": 1.5,
+            "durationSeconds": 0.5,
+            "endSeconds": 2.0
+        }
+    ]
+    # demo/app.py와 동일한 완전한 초기값 구조
+    initial_value = {
+        "notes": notes,
+        "tempo": 120,
+        "timeSignature": {"numerator": 4, "denominator": 4},
+        "editMode": "select",
+        "snapSetting": "1/4",
+        "pixelsPerBeat": 80,  # demo/app.py와 동일
+        "curve_data": {},
+        "use_backend_audio": True  # demo/app.py와 동일하게 True
+    }
+    print("🎼 예제 멜로디 생성됨")
+    return initial_value
+# Gradio 인터페이스
+with gr.Blocks(title="UTAU WebUI", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎵 UTAU WebUI - Gradio로 구현된 UTAU 음성 합성기")
+    gr.Markdown("피아노롤에서 노트를 그리고 '음성 합성' 버튼을 클릭하여 음성을 생성하세요.")
+    gr.Markdown("## 🎤 주의 사항")
+    gr.Markdown("""
+- 유닛 선택 알고리즘이 CVC 음원에 최적화 되지 않아서 발음 오류가 발생할 수 있습니다.
+- 일부 UI 오류가 있으며 현재 해결 중 입니다.
+- 템포가 적용되지 않는 문제가 있습니다. 120bpm을 유지하면서 사용해주세요.""")
+    with gr.Row():
+        with gr.Column(scale=3):
+            # demo/app.py와 동일한 초기값 구조
+            initial_pianoroll_value = {
+                "notes": [],
+                "tempo": 120,
+                "timeSignature": {"numerator": 4, "denominator": 4},
+                "editMode": "select",
+                "snapSetting": "1/4",
+                "pixelsPerBeat": 80,
+                "curve_data": {},
+                "use_backend_audio": True  # demo/app.py와 동일하게 True
+            }
+            pianoroll = grp.PianoRoll(
+                width=1000,
+                height=800,
+                label="피아노롤 편집기",
+                value=initial_pianoroll_value,
+                elem_id="piano_roll_utau",  # 고유 ID 추가
+                use_backend_audio=True  # demo/app.py와 동일하게 True로 시작
+            )
+            with gr.Row():
+                clear_btn = gr.Button("🗑️ 초기화", size="sm")
+                example_btn = gr.Button("🎼 예제 멜로디", size="sm", variant="secondary")
+                info_text = gr.Markdown("**사용법:** 클릭하여 노트 추가, 드래그하여 길이 조정, 더블클릭하여 가사 입력")
+        with gr.Column(scale=1):
+            # 엔진 선택 및 가사 입력
+            gr.Markdown("### 🎤 음성 엔진")
+            with gr.Group():
+                # UTAU 엔진 선택지를 동적으로 생성
+                engine_choices = []
+                if USE_UTAU and utau_engine:
+                    engine_choices.append(f"UTAU 엔진 ({'hanseol CVC'})")
+                engine_choices.append("기본 엔진")
+                engine_radio = gr.Radio(
+                    choices=engine_choices,
+                    value=f"UTAU 엔진 ({'hanseol CVC'})" if USE_UTAU and utau_engine else "기본 엔진",
+                    label="합성 엔진",
+                    info="UTAU 엔진은 실제 보이스뱅크 사용"
+                )
+                # 가사는 피아노롤 노트에서 직접 입력
+                gr.Markdown("**가사 입력**: 피아노롤에서 노트를 더블클릭하여 가사를 입력하세요.")
+            synthesis_btn = gr.Button("🎵 음성 합성", variant="primary", size="lg")
+            status_text = gr.Textbox(
+                label="합성 상태",
+                value="노트를 추가하고 합성 버튼을 클릭하세요.",
+                interactive=False,
+                lines=2
+            )
+            audio_output = gr.Audio(
+                label="합성된 음성",
+                visible=True
+            )
+            gr.Markdown("### 📊 보이스뱅크 정보")
+            if USE_UTAU:
+                compression_info = utau_engine.get_compression_info()
+                gr.Markdown(f"""
+- **보이스뱅크:** hanseol CVC (압축된 HDF5 🗜️)
+- **CV:** KUNGOM
+- **UTAU:** KITANE 백한설
+- **사용 가능한 음소:** {len(utau_engine.get_available_phonemes())}개
+- **압축율:** {compression_info.get('compression_ratio', 0):.1f}%
+- **용량:** {compression_info.get('compressed_size_bytes', 0) / (1024*1024):.1f} MB""")
+            else:
+                gr.Markdown("""
+- **보이스뱅크:** ❌ 압축된 보이스뱅크 없음
+- **상태:** 제한된 모드로 실행 중
+- **해결책:** `make compress` 명령어로 보이스뱅크를 먼저 압축하세요.""")
+    # 이벤트 핸들러
+    synthesis_btn.click(
+        fn=synthesize_notes,
+        inputs=[
+            pianoroll,
+            engine_radio
+        ],
+        outputs=[pianoroll, audio_output, status_text]
+    )
+    def clear_pianoroll():
+        """피아노롤 초기화 - demo/app.py와 동일한 방식"""
+        initial_data = {
+            "notes": [],
+            "tempo": 120,
+            "timeSignature": {"numerator": 4, "denominator": 4},
+            "editMode": "select",
+            "snapSetting": "1/4",
+            "pixelsPerBeat": 80,  # demo/app.py와 동일
+            "curve_data": {},
+            "use_backend_audio": True  # demo/app.py와 동일하게 True 유지
+        }
+        print("🗑️ 피아노롤 초기화됨")
+        return initial_data
+    clear_btn.click(
+        fn=clear_pianoroll,
+        outputs=[pianoroll]
+    )
+    example_btn.click(
+        fn=create_example_melody,
+        outputs=[pianoroll]
+    )
+    # 초기 설정
+    create_test_voice_sample()
+    # playhead 동작을 위한 이벤트 핸들러 추가
+    def log_play_event(event_data=None):
+        print("🔊 Play event triggered:", event_data)
+        return f"재생 시작: {event_data if event_data else '재생 중'}"
+    def log_pause_event(event_data=None):
+        print("🔊 Pause event triggered:", event_data)
+        return f"일시정지: {event_data if event_data else '일시정지됨'}"
+    def log_stop_event(event_data=None):
+        print("🔊 Stop event triggered:", event_data)
+        return f"정지: {event_data if event_data else '정지됨'}"
+    # playhead 이벤트 핸들러 연결
+    pianoroll.play(log_play_event, outputs=status_text)
+    pianoroll.pause(log_pause_event, outputs=status_text)
+    pianoroll.stop(log_stop_event, outputs=status_text)
+# Launch `demo` only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    demo.launch()

compressed_utau_engine.py ADDED Viewed

	@@ -0,0 +1,208 @@

+import logging
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+import numpy as np
+import soundfile as sf
+from straycat import Resampler
+from voice_data_converter import CompressedVoicebankManager, OtoEntry
+logger = logging.getLogger(__name__)
+class CompressedUTAUEngine:
+    """압축된 HDF5 보이스뱅크를 사용하는 UTAU 호환 음성 합성 엔진"""
+    def __init__(self, compressed_voicebank_path: Union[str, Path]):
+        self.voicebank = CompressedVoicebankManager(compressed_voicebank_path)
+        self.default_phoneme = "あ"  # 기본 음소
+        logger.info(f"압축된 UTAU 엔진 초기화 완료")
+    def synthesize_sequence(self,
+                          notes: List[Dict],
+                          lyrics: List[str],
+                          tempo: int = 120,
+                          volume: int = 100) -> Tuple[Optional[str], str]:
+        """노트 시퀀스와 가사로 음성 합성"""
+        if len(notes) != len(lyrics):
+            return None, "노트와 가사의 개수가 일치하지 않습니다."
+        if not notes:
+            return None, "합성할 노트가 없습니다."
+        try:
+            # 전체 시퀀스 길이 계산
+            max_end_time = max(note.get('endSeconds',
+                                      note.get('startSeconds', 0) + note.get('durationSeconds', 0.5))
+                              for note in notes)
+            sample_rate = 44100
+            total_samples = int(max_end_time * sample_rate) + sample_rate
+            final_audio = np.zeros(total_samples)
+            # 각 노트 합성
+            for i, (note, lyric) in enumerate(zip(notes, lyrics)):
+                try:
+                    # 음소 변환
+                    phoneme = self._lyric_to_phoneme(lyric)
+                    # oto 엔트리 찾기
+                    oto_entry = self.voicebank.get_sample_for_phoneme(phoneme)
+                    if not oto_entry:
+                        logger.warning(f"음소 '{phoneme}'에 해당하는 샘플을 찾을 수 없음")
+                        continue
+                    # 오디오 데이터 로드 (압축된 데이터에서)
+                    audio_result = self.voicebank.get_audio_data(oto_entry.filename)
+                    if not audio_result:
+                        logger.warning(f"오디오 파일 로드 실패: {oto_entry.filename}")
+                        continue
+                    source_audio, source_sample_rate = audio_result
+                    # 노트 합성
+                    synth_audio = self._synthesize_note(
+                        note, oto_entry, source_audio, source_sample_rate, tempo, volume
+                    )
+                    if synth_audio is not None:
+                        # 시간 위치 계산 및 오디오 배치
+                        start_sample = int(note.get('startSeconds', 0) * sample_rate)
+                        end_sample = start_sample + len(synth_audio)
+                        if end_sample <= len(final_audio):
+                            final_audio[start_sample:end_sample] += synth_audio * (note.get('velocity', 100) / 100)
+                        else:
+                            # 버퍼 확장
+                            new_size = end_sample + sample_rate
+                            new_final_audio = np.zeros(new_size)
+                            new_final_audio[:len(final_audio)] = final_audio
+                            new_final_audio[start_sample:end_sample] += synth_audio * (note.get('velocity', 100) / 100)
+                            final_audio = new_final_audio
+                        logger.info(f"노트 {i+1} 합성 완료: {phoneme}")
+                except Exception as e:
+                    logger.error(f"노트 {i+1} 합성 실패: {e}")
+                    continue
+            # 최종 오디오 정규화
+            if np.max(np.abs(final_audio)) > 0:
+                final_audio = final_audio / np.max(np.abs(final_audio)) * 0.85
+            # 임시 파일 저장
+            output_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
+            sf.write(output_file.name, final_audio, sample_rate)
+            output_file.close()
+            duration_sec = len(final_audio) / sample_rate
+            return output_file.name, f"✅ 압축된 보이스뱅크로 합성 완료: {len(notes)}개 노트, {duration_sec:.1f}초"
+        except Exception as e:
+            logger.error(f"시퀀스 합성 실패: {e}")
+            return None, f"❌ 합성 실패: {str(e)}"
+    def _lyric_to_phoneme(self, lyric: str) -> str:
+        """가사를 음소로 변환 (기존 로직과 동일)"""
+        lyric = lyric.strip()
+        if not lyric:
+            return self.default_phoneme
+        # 한글 → 일본어 음소 변환 (간단한 매핑)
+        hangul_to_japanese = {
+            '가': 'ka', '나': 'na', '다': 'da', '라': 'ra', '마': 'ma',
+            '바': 'ba', '사': 'sa', '아': 'a', '자': 'za', '차': 'cha',
+            '카': 'ka', '타': 'ta', '파': 'pa', '하': 'ha',
+            '거': 'ke', '너': 'ne', '더': 'de', '러': 're', '머': 'me',
+            '버': 'be', '서': 'se', '어': 'e', '저': 'ze', '처': 'che',
+            '커': 'ke', '터': 'te', '퍼': 'pe', '허': 'he',
+            '고': 'ko', '노': 'no', '도': 'do', '로': 'ro', '모': 'mo',
+            '보': 'bo', '소': 'so', '오': 'o', '조': 'zo', '초': 'cho',
+            '코': 'ko', '토': 'to', '포': 'po', '호': 'ho',
+            '구': 'ku', '누': 'nu', '두': 'du', '루': 'ru', '무': 'mu',
+            '부': 'bu', '수': 'su', '우': 'u', '주': 'zu', '추': 'chu',
+            '쿠': 'ku', '투': 'tu', '푸': 'pu', '후': 'hu',
+            '기': 'ki', '니': 'ni', '디': 'di', '리': 'ri', '미': 'mi',
+            '비': 'bi', '시': 'si', '이': 'i', '지': 'zi', '치': 'chi',
+            '키': 'ki', '티': 'ti', '피': 'pi', '히': 'hi',
+            '도': 'do', '레': 're', '미': 'mi', '파': 'pa', '솔': 'so', '라': 'ra', '시': 'si'
+        }
+        if lyric in hangul_to_japanese:
+            return hangul_to_japanese[lyric]
+        return lyric if lyric in self.voicebank.oto_entries else self.default_phoneme
+    def _synthesize_note(self,
+                        note: Dict,
+                        oto_entry: OtoEntry,
+                        source_audio: np.ndarray,
+                        source_sample_rate: int,
+                        tempo: int,
+                        volume: int) -> Optional[np.ndarray]:
+        """개별 노트 합성 (압축된 오디오 데이터 사용)"""
+        try:
+            # 임시 파일에 원본 오디오 저장
+            temp_input = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
+            sf.write(temp_input.name, source_audio, source_sample_rate)
+            temp_input.close()
+            # 출력 파일
+            temp_output = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
+            temp_output.close()
+            # 노트 정보 추출
+            pitch = note['pitch']
+            duration_ms = note.get('durationSeconds', 0.5) * 1000
+            velocity = note.get('velocity', 100)
+            # MIDI 노트를 음계명으로 변환
+            note_name = self._midi_to_note_name(pitch)
+            # straycat Resampler로 합성
+            resampler = Resampler(
+                in_file=temp_input.name,
+                out_file=temp_output.name,
+                pitch=note_name,
+                velocity=velocity,
+                length=max(duration_ms, 200),  # 최소 200ms
+                volume=volume,
+                offset=oto_entry.offset,
+                consonant=oto_entry.consonant,
+                cutoff=oto_entry.cutoff,
+                modulation=10,
+                tempo=f'!{tempo}'
+            )
+            # 합성된 오디오 로드
+            if Path(temp_output.name).exists():
+                synth_audio, _ = sf.read(temp_output.name)
+                # 정리
+                Path(temp_input.name).unlink(missing_ok=True)
+                Path(temp_output.name).unlink(missing_ok=True)
+                return synth_audio
+            else:
+                logger.error(f"합성된 파일이 생성되지 않음: {temp_output.name}")
+                return None
+        except Exception as e:
+            logger.error(f"노트 합성 실패: {e}")
+            return None
+    def _midi_to_note_name(self, midi_note: int) -> str:
+        """MIDI 노트 번호를 음계명으로 변환"""
+        notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
+        octave = (midi_note // 12) - 1
+        note = notes[midi_note % 12]
+        return f"{note}{octave}"
+    def get_available_phonemes(self) -> List[str]:
+        """사용 가능한 음소 목록 반환"""
+        return self.voicebank.list_available_phonemes()
+    def get_compression_info(self) -> Dict[str, any]:
+        """압축 정보 반환"""
+        return self.voicebank.get_compression_info()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,32 @@

+# Package metadata for the utau-webui project.
+[project]
+name = "utau-webui"
+version = "0.1.0"
+description = "한국어 CVC 보이스뱅크를 사용한 웹 기반 UTAU 음성 합성기"
+readme = "README.md"
+requires-python = ">=3.12"
+authors = [
+    { name = "UTAU WebUI Team" }
+]
+keywords = ["utau", "voice-synthesis", "korean", "music", "audio"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: End Users/Desktop",
+    "Topic :: Multimedia :: Sound/Audio :: Speech",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.12",
+]
+# Runtime dependencies; each entry specifies only a minimum version (>=).
+dependencies = [
+    "gradio>=5.33.1",
+    "gradio-pianoroll>=0.0.8",
+    "h5py>=3.10.0",
+    "librosa>=0.11.0",
+    "llvmlite>=0.44.0",
+    "numba>=0.61.2",
+    "numpy>=2.2.0",
+    "pyworld>=0.3.5",
+    "resampy>=0.4.3",
+    "scipy>=1.15.3",
+    "setuptools>=80.9.0",
+    "soundfile>=0.12.1",
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,809 @@

+# This file was autogenerated by uv via the following command:
+#    uv export -o requirements.txt
+aiofiles==24.1.0 \
+    --hash=sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c \
+    --hash=sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5
+    # via gradio
+annotated-types==0.7.0 \
+    --hash=sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53 \
+    --hash=sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89
+    # via pydantic
+anyio==4.9.0 \
+    --hash=sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028 \
+    --hash=sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c
+    # via
+    #   gradio
+    #   httpx
+    #   starlette
+audioop-lts==0.2.1 ; python_full_version >= '3.13' \
+    --hash=sha256:05da64e73837f88ee5c6217d732d2584cf638003ac72df124740460531e95e47 \
+    --hash=sha256:120678b208cca1158f0a12d667af592e067f7a50df9adc4dc8f6ad8d065a93fb \
+    --hash=sha256:161249db9343b3c9780ca92c0be0d1ccbfecdbccac6844f3d0d44b9c4a00a17f \
+    --hash=sha256:2aeb6f96f7f6da80354330470b9134d81b4cf544cdd1c549f2f45fe964d28059 \
+    --hash=sha256:2bdb3b7912ccd57ea53197943f1bbc67262dcf29802c4a6df79ec1c715d45a78 \
+    --hash=sha256:3827e3fce6fee4d69d96a3d00cd2ab07f3c0d844cb1e44e26f719b34a5b15455 \
+    --hash=sha256:4a8dd6a81770f6ecf019c4b6d659e000dc26571b273953cef7cd1d5ce2ff3ae6 \
+    --hash=sha256:534ce808e6bab6adb65548723c8cbe189a3379245db89b9d555c4210b4aaa9b6 \
+    --hash=sha256:54cd4520fc830b23c7d223693ed3e1b4d464997dd3abc7c15dce9a1f9bd76ab2 \
+    --hash=sha256:56b7a0a4dba8e353436f31a932f3045d108a67b5943b30f85a5563f4d8488d77 \
+    --hash=sha256:5b7b4ff9de7a44e0ad2618afdc2ac920b91f4a6d3509520ee65339d4acde5abf \
+    --hash=sha256:64562c5c771fb0a8b6262829b9b4f37a7b886c01b4d3ecdbae1d629717db08b4 \
+    --hash=sha256:6e899eb8874dc2413b11926b5fb3857ec0ab55222840e38016a6ba2ea9b7d5e3 \
+    --hash=sha256:72e37f416adb43b0ced93419de0122b42753ee74e87070777b53c5d2241e7fab \
+    --hash=sha256:78bfb3703388c780edf900be66e07de5a3d4105ca8e8720c5c4d67927e0b15d0 \
+    --hash=sha256:a351af79edefc2a1bd2234bfd8b339935f389209943043913a919df4b0f13300 \
+    --hash=sha256:c45317debeb64002e980077642afbd977773a25fa3dfd7ed0c84dccfc1fafcb0 \
+    --hash=sha256:c589f06407e8340e81962575fcffbba1e92671879a221186c3d4662de9fe804e \
+    --hash=sha256:d1cd3c0b6f2ca25c7d2b1c3adeecbe23e65689839ba73331ebc7d893fcda7ffe \
+    --hash=sha256:d2d5434717f33117f29b5691fbdf142d36573d751716249a288fbb96ba26a281 \
+    --hash=sha256:d2de9b6fb8b1cf9f03990b299a9112bfdf8b86b6987003ca9e8a6c4f56d39543 \
+    --hash=sha256:d6bd20c7a10abcb0fb3d8aaa7508c0bf3d40dfad7515c572014da4b979d3310a \
+    --hash=sha256:e175350da05d2087e12cea8e72a70a1a8b14a17e92ed2022952a4419689ede5e \
+    --hash=sha256:e1af3ff32b8c38a7d900382646e91f2fc515fd19dea37e9392275a5cbfdbff63 \
+    --hash=sha256:e81268da0baa880431b68b1308ab7257eb33f356e57a5f9b1f915dfb13dd1387 \
+    --hash=sha256:f0ed1ad9bd862539ea875fb339ecb18fcc4148f8d9908f4502df28f94d23491a \
+    --hash=sha256:f0f2f336aa2aee2bce0b0dcc32bbba9178995454c7b979cf6ce086a8801e14c7 \
+    --hash=sha256:f24865991b5ed4b038add5edbf424639d1358144f4e2a3e7a84bc6ba23e35074 \
+    --hash=sha256:f51bb55122a89f7a0817d7ac2319744b4640b5b446c4c3efcea5764ea99ae509 \
+    --hash=sha256:f626a01c0a186b08f7ff61431c01c055961ee28769591efa8800beadd27a2959 \
+    --hash=sha256:fbae5d6925d7c26e712f0beda5ed69ebb40e14212c185d129b8dfbfcc335eb48 \
+    --hash=sha256:fd1345ae99e17e6910f47ce7d52673c6a1a70820d78b67de1b7abb3af29c426a \
+    --hash=sha256:ff3f97b3372c97782e9c6d3d7fdbe83bce8f70de719605bd7ee1839cd1ab360a
+    # via
+    #   gradio
+    #   standard-aifc
+    #   standard-sunau
+audioread==3.0.1 \
+    --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \
+    --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d
+    # via librosa
+certifi==2025.4.26 \
+    --hash=sha256:0a816057ea3cdefcef70270d2c515e4506bbc954f417fa5ade2021213bb8f0c6 \
+    --hash=sha256:30350364dfe371162649852c63336a15c70c6510c2ad5015b21c2345311805f3
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+cffi==1.17.1 \
+    --hash=sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2 \
+    --hash=sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36 \
+    --hash=sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824 \
+    --hash=sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3 \
+    --hash=sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed \
+    --hash=sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8 \
+    --hash=sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903 \
+    --hash=sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683 \
+    --hash=sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9 \
+    --hash=sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c \
+    --hash=sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4 \
+    --hash=sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65 \
+    --hash=sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93 \
+    --hash=sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4 \
+    --hash=sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3 \
+    --hash=sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff \
+    --hash=sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5 \
+    --hash=sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd \
+    --hash=sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5 \
+    --hash=sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d \
+    --hash=sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e \
+    --hash=sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a \
+    --hash=sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99
+    # via soundfile
+charset-normalizer==3.4.2 \
+    --hash=sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7 \
+    --hash=sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0 \
+    --hash=sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b \
+    --hash=sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff \
+    --hash=sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e \
+    --hash=sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148 \
+    --hash=sha256:4e594135de17ab3866138f496755f302b72157d115086d100c3f19370839dd3a \
+    --hash=sha256:5a9979887252a82fefd3d3ed2a8e3b937a7a809f65dcb1e068b090e165bbe99e \
+    --hash=sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63 \
+    --hash=sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c \
+    --hash=sha256:7222ffd5e4de8e57e03ce2cef95a4c43c98fcb72ad86909abdfc2c17d227fc1b \
+    --hash=sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0 \
+    --hash=sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0 \
+    --hash=sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1 \
+    --hash=sha256:a370b3e078e418187da8c3674eddb9d983ec09445c99a3a263c2011993522981 \
+    --hash=sha256:a955b438e62efdf7e0b7b52a64dc5c3396e2634baa62471768a64bc2adb73d5c \
+    --hash=sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980 \
+    --hash=sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7 \
+    --hash=sha256:bee093bf902e1d8fc0ac143c88902c3dfc8941f7ea1d6a8dd2bcb786d33db03d \
+    --hash=sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3 \
+    --hash=sha256:cf713fe9a71ef6fd5adf7a79670135081cd4431c2943864757f0fa3a65b1fafd \
+    --hash=sha256:d41c4d287cfc69060fa91cae9683eacffad989f1a10811995fa309df656ec214 \
+    --hash=sha256:db4c7bf0e07fc3b7d89ac2a5880a6a8062056801b83ff56d8464b70f65482b6c \
+    --hash=sha256:dedb8adb91d11846ee08bec4c8236c8549ac721c245678282dcb06b221aab59f \
+    --hash=sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691 \
+    --hash=sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf \
+    --hash=sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b \
+    --hash=sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a
+    # via requests
+click==8.2.1 ; sys_platform != 'emscripten' \
+    --hash=sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202 \
+    --hash=sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b
+    # via
+    #   typer
+    #   uvicorn
+colorama==0.4.6 ; sys_platform == 'win32' \
+    --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \
+    --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6
+    # via
+    #   click
+    #   tqdm
+decorator==5.2.1 \
+    --hash=sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360 \
+    --hash=sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a
+    # via librosa
+fastapi==0.115.12 \
+    --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \
+    --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d
+    # via gradio
+ffmpy==0.6.0 \
+    --hash=sha256:332dd93198a162db61e527e866a04578d3713e577bfe68f2ed26ba9d09dbc948 \
+    --hash=sha256:c8369bf45f8bd5285ebad94c4a789a79e7af86eded74c1f8c36eccf57aaea58c
+    # via gradio
+filelock==3.18.0 \
+    --hash=sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2 \
+    --hash=sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de
+    # via huggingface-hub
+fsspec==2025.5.1 \
+    --hash=sha256:24d3a2e663d5fc735ab256263c4075f374a174c3410c0b25e5bd1970bceaa462 \
+    --hash=sha256:2e55e47a540b91843b755e83ded97c6e897fa0942b11490113f09e9c443c2475
+    # via
+    #   gradio-client
+    #   huggingface-hub
+gradio==5.33.1 \
+    --hash=sha256:c4329b04280d62041fbf0113e94fb5c4d20e0555ce1ac69174bf98225350159b \
+    --hash=sha256:f74c737aa92fc02b4d7dca7e50ee13ddce548aa16c9fcbe907ceabf93722f94d
+    # via
+    #   gradio-pianoroll
+    #   utau-webui
+gradio-client==1.10.3 \
+    --hash=sha256:941e7f8d9a160f88487e9780a3db2736a40ea2b8b69d53ffdb306e47ef658b76 \
+    --hash=sha256:9e99b88e47f05dc3b68e40a3f3f83819f8d0ddcd43466ad385fe42e137825774
+    # via gradio
+gradio-pianoroll==0.0.8 \
+    --hash=sha256:26abd2c98ccb8bb30e8269324ca8675109a502e266c4e9c8bfff524d1a9c0218 \
+    --hash=sha256:f7ac6d27dab2873c35bba5041b94afc9159922be2a9cebe202e8a87f4ec79e86
+    # via utau-webui
+groovy==0.1.2 \
+    --hash=sha256:25c1dc09b3f9d7e292458aa762c6beb96ea037071bf5e917fc81fb78d2231083 \
+    --hash=sha256:7f7975bab18c729a257a8b1ae9dcd70b7cafb1720481beae47719af57c35fa64
+    # via gradio
+h11==0.16.0 \
+    --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \
+    --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86
+    # via
+    #   httpcore
+    #   uvicorn
+h5py==3.14.0 \
+    --hash=sha256:0cbd41f4e3761f150aa5b662df991868ca533872c95467216f2bec5fcad84882 \
+    --hash=sha256:2372116b2e0d5d3e5e705b7f663f7c8d96fa79a4052d250484ef91d24d6a08f4 \
+    --hash=sha256:554ef0ced3571366d4d383427c00c966c360e178b5fb5ee5bb31a435c424db0c \
+    --hash=sha256:6da62509b7e1d71a7d110478aa25d245dd32c8d9a1daee9d2a42dba8717b047a \
+    --hash=sha256:aa4b7bbce683379b7bf80aaba68e17e23396100336a8d500206520052be2f812 \
+    --hash=sha256:ae18e3de237a7a830adb76aaa68ad438d85fe6e19e0d99944a3ce46b772c69b3 \
+    --hash=sha256:bf4897d67e613ecf5bdfbdab39a1158a64df105827da70ea1d90243d796d367f \
+    --hash=sha256:d90e6445ab7c146d7f7981b11895d70bc1dd91278a4f9f9028bc0c95e4a53f13 \
+    --hash=sha256:e0045115d83272090b0717c555a31398c2c089b87d212ceba800d3dc5d952e23 \
+    --hash=sha256:e8cbaf6910fa3983c46172666b0b8da7b7bd90d764399ca983236f2400436eeb \
+    --hash=sha256:ef9603a501a04fcd0ba28dd8f0995303d26a77a980a1f9474b3417543d4c6174
+    # via utau-webui
+hf-xet==1.1.3 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \
+    --hash=sha256:30c575a5306f8e6fda37edb866762140a435037365eba7a17ce7bd0bc0216a8b \
+    --hash=sha256:7c1a6aa6abed1f696f8099aa9796ca04c9ee778a58728a115607de9cc4638ff1 \
+    --hash=sha256:8203f52827e3df65981984936654a5b390566336956f65765a8aa58c362bb841 \
+    --hash=sha256:a5f09b1dd24e6ff6bcedb4b0ddab2d81824098bb002cf8b4ffa780545fa348c3 \
+    --hash=sha256:b578ae5ac9c056296bb0df9d018e597c8dc6390c5266f35b5c44696003cde9f3 \
+    --hash=sha256:b788a61977fbe6b5186e66239e2a329a3f0b7e7ff50dad38984c0c74f44aeca1 \
+    --hash=sha256:c3b508b5f583a75641aebf732853deb058953370ce8184f5dabc49f803b0819b \
+    --hash=sha256:fd2da210856444a34aad8ada2fc12f70dabed7cc20f37e90754d1d9b43bc0534
+    # via huggingface-hub
+httpcore==1.0.9 \
+    --hash=sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55 \
+    --hash=sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8
+    # via httpx
+httpx==0.28.1 \
+    --hash=sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc \
+    --hash=sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad
+    # via
+    #   gradio
+    #   gradio-client
+    #   safehttpx
+huggingface-hub==0.32.6 \
+    --hash=sha256:32cde9558c965477556edca72352621def7fbc42e167aaf33f4cdb9af65bb28b \
+    --hash=sha256:8e960f23dc57519c6c2a0bbc7e9bc030eaa14e7f2d61f8e68fd3d025dabed2fa
+    # via
+    #   gradio
+    #   gradio-client
+idna==3.10 \
+    --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
+    --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+jinja2==3.1.6 \
+    --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \
+    --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67
+    # via gradio
+joblib==1.5.1 \
+    --hash=sha256:4719a31f054c7d766948dcd83e9613686b27114f190f717cec7eaa2084f8a74a \
+    --hash=sha256:f4f86e351f39fe3d0d32a9f2c3d8af1ee4cec285aafcb27003dda5205576b444
+    # via
+    #   librosa
+    #   scikit-learn
+lazy-loader==0.4 \
+    --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \
+    --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1
+    # via librosa
+librosa==0.11.0 \
+    --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \
+    --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908
+    # via utau-webui
+llvmlite==0.44.0 \
+    --hash=sha256:07667d66a5d150abed9157ab6c0b9393c9356f229784a4385c02f99e94fc94d4 \
+    --hash=sha256:1d671a56acf725bf1b531d5ef76b86660a5ab8ef19bb6a46064a705c6ca80aad \
+    --hash=sha256:2fb7c4f2fb86cbae6dca3db9ab203eeea0e22d73b99bc2341cdf9de93612e930 \
+    --hash=sha256:319bddd44e5f71ae2689859b7203080716448a3cd1128fb144fe5c055219d516 \
+    --hash=sha256:46224058b13c96af1365290bdfebe9a6264ae62fb79b2b55693deed11657a8bf \
+    --hash=sha256:5f79a728e0435493611c9f405168682bb75ffd1fbe6fc360733b850c80a026db \
+    --hash=sha256:9c58867118bad04a0bb22a2e0068c693719658105e40009ffe95c7000fcde88e \
+    --hash=sha256:aa0097052c32bf721a4efc03bd109d335dfa57d9bffb3d4c24cc680711b8b4fc \
+    --hash=sha256:c0143a5ef336da14deaa8ec26c5449ad5b6a2b564df82fcef4be040b9cacfea9 \
+    --hash=sha256:d752f89e31b66db6f8da06df8b39f9b91e78c5feea1bf9e8c1fba1d1c24c065d \
+    --hash=sha256:eae7e2d4ca8f88f89d315b48c6b741dcb925d6a1042da694aa16ab3dd4cbd3a1
+    # via
+    #   numba
+    #   utau-webui
+markdown-it-py==3.0.0 ; sys_platform != 'emscripten' \
+    --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \
+    --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb
+    # via rich
+markupsafe==3.0.2 \
+    --hash=sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30 \
+    --hash=sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9 \
+    --hash=sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396 \
+    --hash=sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028 \
+    --hash=sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557 \
+    --hash=sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a \
+    --hash=sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c \
+    --hash=sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c \
+    --hash=sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22 \
+    --hash=sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094 \
+    --hash=sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5 \
+    --hash=sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225 \
+    --hash=sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c \
+    --hash=sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87 \
+    --hash=sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf \
+    --hash=sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb \
+    --hash=sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48 \
+    --hash=sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c \
+    --hash=sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6 \
+    --hash=sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd \
+    --hash=sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1 \
+    --hash=sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d \
+    --hash=sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca \
+    --hash=sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a \
+    --hash=sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe \
+    --hash=sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8 \
+    --hash=sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f \
+    --hash=sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f \
+    --hash=sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0 \
+    --hash=sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79 \
+    --hash=sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430
+    # via
+    #   gradio
+    #   jinja2
+mdurl==0.1.2 ; sys_platform != 'emscripten' \
+    --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \
+    --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba
+    # via markdown-it-py
+msgpack==1.1.0 \
+    --hash=sha256:06f5fd2f6bb2a7914922d935d3b8bb4a7fff3a9a91cfce6d06c13bc42bec975b \
+    --hash=sha256:071603e2f0771c45ad9bc65719291c568d4edf120b44eb36324dcb02a13bfddf \
+    --hash=sha256:0907e1a7119b337971a689153665764adc34e89175f9a34793307d9def08e6ca \
+    --hash=sha256:0f92a83b84e7c0749e3f12821949d79485971f087604178026085f60ce109330 \
+    --hash=sha256:115a7af8ee9e8cddc10f87636767857e7e3717b7a2e97379dc2054712693e90f \
+    --hash=sha256:17fb65dd0bec285907f68b15734a993ad3fc94332b5bb21b0435846228de1f39 \
+    --hash=sha256:2137773500afa5494a61b1208619e3871f75f27b03bcfca7b3a7023284140247 \
+    --hash=sha256:398b713459fea610861c8a7b62a6fec1882759f308ae0795b5413ff6a160cf3c \
+    --hash=sha256:4676e5be1b472909b2ee6356ff425ebedf5142427842aa06b4dfd5117d1ca8a2 \
+    --hash=sha256:4a1964df7b81285d00a84da4e70cb1383f2e665e0f1f2a7027e683956d04b734 \
+    --hash=sha256:4b51405e36e075193bc051315dbf29168d6141ae2500ba8cd80a522964e31434 \
+    --hash=sha256:58dfc47f8b102da61e8949708b3eafc3504509a5728f8b4ddef84bd9e16ad420 \
+    --hash=sha256:59caf6a4ed0d164055ccff8fe31eddc0ebc07cf7326a2aaa0dbf7a4001cd823e \
+    --hash=sha256:5dbad74103df937e1325cc4bfeaf57713be0b4f15e1c2da43ccdd836393e2ea2 \
+    --hash=sha256:65553c9b6da8166e819a6aa90ad15288599b340f91d18f60b2061f402b9a4915 \
+    --hash=sha256:7a946a8992941fea80ed4beae6bff74ffd7ee129a90b4dd5cf9c476a30e9708d \
+    --hash=sha256:7c9a35ce2c2573bada929e0b7b3576de647b0defbd25f5139dcdaba0ae35a4cc \
+    --hash=sha256:a51abd48c6d8ac89e0cfd4fe177c61481aca2d5e7ba42044fd218cfd8ea9899f \
+    --hash=sha256:ad33e8400e4ec17ba782f7b9cf868977d867ed784a1f5f2ab46e7ba53b6e1e1b \
+    --hash=sha256:b4c01941fd2ff87c2a934ee6055bda4ed353a7846b8d4f341c428109e9fcde8c \
+    --hash=sha256:bce7d9e614a04d0883af0b3d4d501171fbfca038f12c77fa838d9f198147a23f \
+    --hash=sha256:d46cf9e3705ea9485687aa4001a76e44748b609d260af21c4ceea7f2212a501d \
+    --hash=sha256:dd432ccc2c72b914e4cb77afce64aab761c1137cc698be3984eee260bcb2896e
+    # via librosa
+numba==0.61.2 \
+    --hash=sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2 \
+    --hash=sha256:3a10a8fc9afac40b1eac55717cece1b8b1ac0b946f5065c89e00bde646b5b154 \
+    --hash=sha256:48a53a3de8f8793526cbe330f2a39fe9a6638efcbf11bd63f3d2f9757ae345cd \
+    --hash=sha256:4ddce10009bc097b080fc96876d14c051cc0c7679e99de3e0af59014dab7dfe8 \
+    --hash=sha256:59321215e2e0ac5fa928a8020ab00b8e57cda8a97384963ac0dfa4d4e6aa54e7 \
+    --hash=sha256:5b1bb509d01f23d70325d3a5a0e237cbc9544dd50e50588bc581ba860c213546 \
+    --hash=sha256:5f154aaea625fb32cfbe3b80c5456d514d416fcdf79733dd69c0df3a11348e9e \
+    --hash=sha256:7d3bcada3c9afba3bed413fba45845f2fb9cd0d2b27dd58a1be90257e293d140 \
+    --hash=sha256:8750ee147940a6637b80ecf7f95062185ad8726c8c28a2295b8ec1160a196f7d \
+    --hash=sha256:97cf4f12c728cf77c9c1d7c23707e4d8fb4632b46275f8f3397de33e5877af18 \
+    --hash=sha256:bdbca73ad81fa196bd53dc12e3aaf1564ae036e0c125f237c7644fe64a4928ab
+    # via
+    #   librosa
+    #   resampy
+    #   utau-webui
+numpy==2.2.6 \
+    --hash=sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff \
+    --hash=sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84 \
+    --hash=sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6 \
+    --hash=sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f \
+    --hash=sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b \
+    --hash=sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49 \
+    --hash=sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571 \
+    --hash=sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff \
+    --hash=sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4 \
+    --hash=sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566 \
+    --hash=sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40 \
+    --hash=sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd \
+    --hash=sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06 \
+    --hash=sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282 \
+    --hash=sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3 \
+    --hash=sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1 \
+    --hash=sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c \
+    --hash=sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d \
+    --hash=sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2 \
+    --hash=sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c \
+    --hash=sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f \
+    --hash=sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd \
+    --hash=sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868 \
+    --hash=sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d \
+    --hash=sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87 \
+    --hash=sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa \
+    --hash=sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f \
+    --hash=sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda \
+    --hash=sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249 \
+    --hash=sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de \
+    --hash=sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8
+    # via
+    #   gradio
+    #   h5py
+    #   librosa
+    #   numba
+    #   pandas
+    #   pyworld
+    #   resampy
+    #   scikit-learn
+    #   scipy
+    #   soundfile
+    #   soxr
+    #   utau-webui
+orjson==3.10.18 \
+    --hash=sha256:0315317601149c244cb3ecef246ef5861a64824ccbcb8018d32c66a60a84ffbc \
+    --hash=sha256:187ec33bbec58c76dbd4066340067d9ece6e10067bb0cc074a21ae3300caa84e \
+    --hash=sha256:1ebeda919725f9dbdb269f59bc94f861afbe2a27dce5608cdba2d92772364d1c \
+    --hash=sha256:22748de2a07fcc8781a70edb887abf801bb6142e6236123ff93d12d92db3d406 \
+    --hash=sha256:2d808e34ddb24fc29a4d4041dcfafbae13e129c93509b847b14432717d94b44f \
+    --hash=sha256:303565c67a6c7b1f194c94632a4a39918e067bd6176a48bec697393865ce4f06 \
+    --hash=sha256:356b076f1662c9813d5fa56db7d63ccceef4c271b1fb3dd522aca291375fcf17 \
+    --hash=sha256:3a83c9954a4107b9acd10291b7f12a6b29e35e8d43a414799906ea10e75438e6 \
+    --hash=sha256:3d600be83fe4514944500fa8c2a0a77099025ec6482e8087d7659e891f23058a \
+    --hash=sha256:50c15557afb7f6d63bc6d6348e0337a880a04eaa9cd7c9d569bcb4e760a24753 \
+    --hash=sha256:559eb40a70a7494cd5beab2d73657262a74a2c59aff2068fdba8f0424ec5b39d \
+    --hash=sha256:5adf5f4eed520a4959d29ea80192fa626ab9a20b2ea13f8f6dc58644f6927103 \
+    --hash=sha256:6612787e5b0756a171c7d81ba245ef63a3533a637c335aa7fcb8e665f4a0966f \
+    --hash=sha256:69c34b9441b863175cc6a01f2935de994025e773f814412030f269da4f7be147 \
+    --hash=sha256:7592bb48a214e18cd670974f289520f12b7aed1fa0b2e2616b8ed9e069e08595 \
+    --hash=sha256:7ac6bd7be0dcab5b702c9d43d25e70eb456dfd2e119d512447468f6405b4a69c \
+    --hash=sha256:86314fdb5053a2f5a5d881f03fca0219bfdf832912aa88d18676a5175c6916b5 \
+    --hash=sha256:8e4b2ae732431127171b875cb2668f883e1234711d3c147ffd69fe5be51a8012 \
+    --hash=sha256:9dca85398d6d093dd41dc0983cbf54ab8e6afd1c547b6b8a311643917fbf4e0c \
+    --hash=sha256:9f72f100cee8dde70100406d5c1abba515a7df926d4ed81e20a9730c062fe9ad \
+    --hash=sha256:ad8eacbb5d904d5591f27dee4031e2c1db43d559edb8f91778efd642d70e6bea \
+    --hash=sha256:aed411bcb68bf62e85588f2a7e03a6082cc42e5a2796e06e72a962d7c6310b52 \
+    --hash=sha256:bb70d489bc79b7519e5803e2cc4c72343c9dc1154258adf2f8925d0b60da7c58 \
+    --hash=sha256:c382a5c0b5931a5fc5405053d36c1ce3fd561694738626c77ae0b1dfc0242ca1 \
+    --hash=sha256:e0da26957e77e9e55a6c2ce2e7182a36a6f6b180ab7189315cb0995ec362e049 \
+    --hash=sha256:e8da3947d92123eda795b68228cafe2724815621fe35e8e320a9e9593a4bcd53 \
+    --hash=sha256:e9e86a6af31b92299b00736c89caf63816f70a4001e750bda179e15564d7a034 \
+    --hash=sha256:f3c29eb9a81e2fbc6fd7ddcfba3e101ba92eaff455b8d602bf7511088bbc0eae \
+    --hash=sha256:f54c1385a0e6aba2f15a40d703b858bedad36ded0491e55d35d905b2c34a4cc3 \
+    --hash=sha256:f872bef9f042734110642b7a11937440797ace8c87527de25e0c53558b579ccc \
+    --hash=sha256:f9f94cf6d3f9cd720d641f8399e390e7411487e493962213390d1ae45c7814fc
+    # via gradio
+packaging==25.0 \
+    --hash=sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 \
+    --hash=sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f
+    # via
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   lazy-loader
+    #   pooch
+pandas==2.3.0 \
+    --hash=sha256:094e271a15b579650ebf4c5155c05dcd2a14fd4fdd72cf4854b2f7ad31ea30be \
+    --hash=sha256:1a881bc1309f3fce34696d07b00f13335c41f5f5a8770a33b09ebe23261cfc67 \
+    --hash=sha256:1d2b33e68d0ce64e26a4acc2e72d747292084f4e8db4c847c6f5f6cbe56ed6d8 \
+    --hash=sha256:213cd63c43263dbb522c1f8a7c9d072e25900f6975596f883f4bebd77295d4f3 \
+    --hash=sha256:2c7e2fc25f89a49a11599ec1e76821322439d90820108309bf42130d2f36c983 \
+    --hash=sha256:2eb4728a18dcd2908c7fccf74a982e241b467d178724545a48d0caf534b38ebf \
+    --hash=sha256:34600ab34ebf1131a7613a260a61dbe8b62c188ec0ea4c296da7c9a06b004133 \
+    --hash=sha256:404d681c698e3c8a40a61d0cd9412cc7364ab9a9cc6e144ae2992e11a2e77a20 \
+    --hash=sha256:430a63bae10b5086995db1b02694996336e5a8ac9a96b4200572b413dfdfccb9 \
+    --hash=sha256:4930255e28ff5545e2ca404637bcc56f031893142773b3468dc021c6c32a1390 \
+    --hash=sha256:6021910b086b3ca756755e86ddc64e0ddafd5e58e076c72cb1585162e5ad259b \
+    --hash=sha256:951805d146922aed8357e4cc5671b8b0b9be1027f0619cea132a9f3f65f2f09c \
+    --hash=sha256:9ff730713d4c4f2f1c860e36c005c7cefc1c7c80c21c0688fd605aa43c9fcf09 \
+    --hash=sha256:b9d8c3187be7479ea5c3d30c32a5d73d62a621166675063b2edd21bc47614027 \
+    --hash=sha256:ba24af48643b12ffe49b27065d3babd52702d95ab70f50e1b34f71ca703e2c0d \
+    --hash=sha256:bb32dc743b52467d488e7a7c8039b821da2826a9ba4f85b89ea95274f863280f \
+    --hash=sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249 \
+    --hash=sha256:c6da97aeb6a6d233fb6b17986234cc723b396b50a3c6804776351994f2a658fd \
+    --hash=sha256:e1991bbb96f4050b09b5f811253c4f3cf05ee89a589379aa36cd623f21a31d6f \
+    --hash=sha256:e78ad363ddb873a631e92a3c063ade1ecfb34cae71e9a2be6ad100f875ac1042 \
+    --hash=sha256:f925f1ef673b4bd0271b1809b72b3270384f2b7d9d14a189b12b7fc02574d575
+    # via gradio
+pillow==11.2.1 \
+    --hash=sha256:062b7a42d672c45a70fa1f8b43d1d38ff76b63421cbbe7f88146b39e8a558d91 \
+    --hash=sha256:0c7b29dbd4281923a2bfe562acb734cee96bbb129e96e6972d315ed9f232bef4 \
+    --hash=sha256:14e33b28bf17c7a38eede290f77db7c664e4eb01f7869e37fa98a5aa95978941 \
+    --hash=sha256:191955c55d8a712fab8934a42bfefbf99dd0b5875078240943f913bb66d46d9f \
+    --hash=sha256:1d535df14716e7f8776b9e7fee118576d65572b4aad3ed639be9e4fa88a1cad3 \
+    --hash=sha256:21e1470ac9e5739ff880c211fc3af01e3ae505859392bf65458c224d0bf283eb \
+    --hash=sha256:225c832a13326e34f212d2072982bb1adb210e0cc0b153e688743018c94a2681 \
+    --hash=sha256:25a5f306095c6780c52e6bbb6109624b95c5b18e40aab1c3041da3e9e0cd3e2d \
+    --hash=sha256:31df6e2d3d8fc99f993fd253e97fae451a8db2e7207acf97859732273e108406 \
+    --hash=sha256:36d6b82164c39ce5482f649b437382c0fb2395eabc1e2b1702a6deb8ad647d6e \
+    --hash=sha256:3e645b020f3209a0181a418bffe7b4a93171eef6c4ef6cc20980b30bebf17b7d \
+    --hash=sha256:3fe735ced9a607fee4f481423a9c36701a39719252a9bb251679635f99d0f7d2 \
+    --hash=sha256:4eb92eca2711ef8be42fd3f67533765d9fd043b8c80db204f16c8ea62ee1a751 \
+    --hash=sha256:5119225c622403afb4b44bad4c1ca6c1f98eed79db8d3bc6e4e160fc6339d66c \
+    --hash=sha256:598174aef4589af795f66f9caab87ba4ff860ce08cd5bb447c6fc553ffee603c \
+    --hash=sha256:63b5dff3a68f371ea06025a1a6966c9a1e1ee452fc8020c2cd0ea41b83e9037b \
+    --hash=sha256:74ee3d7ecb3f3c05459ba95eed5efa28d6092d751ce9bf20e3e253a4e497e691 \
+    --hash=sha256:750f96efe0597382660d8b53e90dd1dd44568a8edb51cb7f9d5d918b80d4de14 \
+    --hash=sha256:78092232a4ab376a35d68c4e6d5e00dfd73454bd12b230420025fbe178ee3b0b \
+    --hash=sha256:78afba22027b4accef10dbd5eed84425930ba41b3ea0a86fa8d20baaf19d807f \
+    --hash=sha256:7bdb5e09068332578214cadd9c05e3d64d99e0e87591be22a324bdbc18925be0 \
+    --hash=sha256:8ce2e8411c7aaef53e6bb29fe98f28cd4fbd9a1d9be2eeea434331aac0536b22 \
+    --hash=sha256:9622e3b6c1d8b551b6e6f21873bdcc55762b4b2126633014cea1803368a9aa16 \
+    --hash=sha256:9ee66787e095127116d91dea2143db65c7bb1e232f617aa5957c0d9d2a3f23a7 \
+    --hash=sha256:a64dd61998416367b7ef979b73d3a85853ba9bec4c2925f74e588879a58716b6 \
+    --hash=sha256:ad275964d52e2243430472fc5d2c2334b4fc3ff9c16cb0a19254e25efa03a155 \
+    --hash=sha256:b0e130705d568e2f43a17bcbe74d90958e8a16263868a12c3e0d9c8162690830 \
+    --hash=sha256:b2dbea1012ccb784a65349f57bbc93730b96e85b42e9bf7b01ef40443db720b4 \
+    --hash=sha256:d189ba1bebfbc0c0e529159631ec72bb9e9bc041f01ec6d3233d6d82eb823bc1 \
+    --hash=sha256:da3104c57bbd72948d75f6a9389e6727d2ab6333c3617f0a89d72d4940aa0443 \
+    --hash=sha256:e0b55f27f584ed623221cfe995c912c61606be8513bfa0e07d2c674b4516d9dd \
+    --hash=sha256:f91ebf30830a48c825590aede79376cb40f110b387c17ee9bd59932c961044f9 \
+    --hash=sha256:fdec757fea0b793056419bca3e9932eb2b0ceec90ef4813ea4c1e072c389eb28 \
+    --hash=sha256:fe15238d3798788d00716637b3d4e7bb6bde18b26e5d08335a96e88564a36b6b
+    # via gradio
+platformdirs==4.3.8 \
+    --hash=sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc \
+    --hash=sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4
+    # via pooch
+pooch==1.8.2 \
+    --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \
+    --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10
+    # via librosa
+pycparser==2.22 \
+    --hash=sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6 \
+    --hash=sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc
+    # via cffi
+pydantic==2.11.5 \
+    --hash=sha256:7f853db3d0ce78ce8bbb148c401c2cdd6431b3473c0cdff2755c7690952a7b7a \
+    --hash=sha256:f9c26ba06f9747749ca1e5c94d6a85cb84254577553c8785576fd38fa64dc0f7
+    # via
+    #   fastapi
+    #   gradio
+pydantic-core==2.33.2 \
+    --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \
+    --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \
+    --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \
+    --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \
+    --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \
+    --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \
+    --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \
+    --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \
+    --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \
+    --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \
+    --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \
+    --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \
+    --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \
+    --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \
+    --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \
+    --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \
+    --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \
+    --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \
+    --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \
+    --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \
+    --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \
+    --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \
+    --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \
+    --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \
+    --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \
+    --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \
+    --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \
+    --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \
+    --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \
+    --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \
+    --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \
+    --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6
+    # via pydantic
+pydub==0.25.1 \
+    --hash=sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6 \
+    --hash=sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f
+    # via gradio
+pygments==2.19.1 ; sys_platform != 'emscripten' \
+    --hash=sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f \
+    --hash=sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c
+    # via rich
+python-dateutil==2.9.0.post0 \
+    --hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \
+    --hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427
+    # via pandas
+python-multipart==0.0.20 \
+    --hash=sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104 \
+    --hash=sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13
+    # via gradio
+pytz==2025.2 \
+    --hash=sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3 \
+    --hash=sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00
+    # via pandas
+pyworld==0.3.5 \
+    --hash=sha256:1b93e53cddb67a0e4faa34d6cf919ac6c662feb1c8c0ed901d71b595ab396aa3 \
+    --hash=sha256:59b48961c2ac34fb01efeb1a77d3eda69c41b676858cbc3a82dfb7602c0c762b \
+    --hash=sha256:860c5c3528f1dbc5c68fa71a16e3bb6990244619e5b9baf62952f3a6bfc6131c
+    # via utau-webui
+pyyaml==6.0.2 \
+    --hash=sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48 \
+    --hash=sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133 \
+    --hash=sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484 \
+    --hash=sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5 \
+    --hash=sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc \
+    --hash=sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1 \
+    --hash=sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652 \
+    --hash=sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5 \
+    --hash=sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8 \
+    --hash=sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476 \
+    --hash=sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563 \
+    --hash=sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b \
+    --hash=sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425 \
+    --hash=sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183 \
+    --hash=sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab \
+    --hash=sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725 \
+    --hash=sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e \
+    --hash=sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4 \
+    --hash=sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba
+    # via
+    #   gradio
+    #   huggingface-hub
+requests==2.32.4 \
+    --hash=sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c \
+    --hash=sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422
+    # via
+    #   huggingface-hub
+    #   pooch
+resampy==0.4.3 \
+    --hash=sha256:a0d1c28398f0e55994b739650afef4e3974115edbe96cd4bb81968425e916e47 \
+    --hash=sha256:ad2ed64516b140a122d96704e32bc0f92b23f45419e8b8f478e5a05f83edcebd
+    # via utau-webui
+rich==14.0.0 ; sys_platform != 'emscripten' \
+    --hash=sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0 \
+    --hash=sha256:82f1bc23a6a21ebca4ae0c45af9bdbc492ed20231dcb63f297d6d1021a9d5725
+    # via typer
+ruff==0.11.13 ; sys_platform != 'emscripten' \
+    --hash=sha256:1808b3ed53e1a777c2ef733aca9051dc9bf7c99b26ece15cb59a0320fbdbd629 \
+    --hash=sha256:26816a218ca6ef02142343fd24c70f7cd8c5aa6c203bca284407adf675984432 \
+    --hash=sha256:26fa247dc68d1d4e72c179e08889a25ac0c7ba4d78aecfc835d49cbfd60bf514 \
+    --hash=sha256:29c3189895a8a6a657b7af4e97d330c8a3afd2c9c8f46c81e2fc5a31866517e3 \
+    --hash=sha256:4a9ddd3ec62a9a89578c85842b836e4ac832d4a2e0bfaad3b02243f930ceafcc \
+    --hash=sha256:4bdfbf1240533f40042ec00c9e09a3aade6f8c10b6414cf11b519488d2635d46 \
+    --hash=sha256:4ffbc82d70424b275b089166310448051afdc6e914fdab90e08df66c43bb5ca9 \
+    --hash=sha256:51c3f95abd9331dc5b87c47ac7f376db5616041173826dfd556cfe3d4977f492 \
+    --hash=sha256:53b15a9dfdce029c842e9a5aebc3855e9ab7771395979ff85b7c1dedb53ddc2b \
+    --hash=sha256:55e4bc3a77842da33c16d55b32c6cac1ec5fb0fbec9c8c513bdce76c4f922165 \
+    --hash=sha256:633bf2c6f35678c56ec73189ba6fa19ff1c5e4807a78bf60ef487b9dd272cc71 \
+    --hash=sha256:6c51f93029d54a910d3d24f7dd0bb909e31b6cd989a5e4ac513f4eb41629f0dc \
+    --hash=sha256:96c27935418e4e8e77a26bb05962817f28b8ef3843a6c6cc49d8783b5507f250 \
+    --hash=sha256:ab153241400789138d13f362c43f7edecc0edfffce2afa6a68434000ecd8f69a \
+    --hash=sha256:aef9c9ed1b5ca28bb15c7eac83b8670cf3b20b478195bd49c8d756ba0a36cf48 \
+    --hash=sha256:b4385285e9179d608ff1d2fb9922062663c658605819a6876d8beef0c30b7f3b \
+    --hash=sha256:d237a496e0778d719efb05058c64d28b757c77824e04ffe8796c7436e26712b7 \
+    --hash=sha256:d28ce58b5ecf0f43c1b71edffabe6ed7f245d5336b17805803312ec9bc665933
+    # via gradio
+safehttpx==0.1.6 \
+    --hash=sha256:407cff0b410b071623087c63dd2080c3b44dc076888d8c5823c00d1e58cb381c \
+    --hash=sha256:b356bfc82cee3a24c395b94a2dbeabbed60aff1aa5fa3b5fe97c4f2456ebce42
+    # via gradio
+scikit-learn==1.7.0 \
+    --hash=sha256:014e07a23fe02e65f9392898143c542a50b6001dbe89cb867e19688e468d049b \
+    --hash=sha256:0521cb460426c56fee7e07f9365b0f45ec8ca7b2d696534ac98bfb85e7ae4775 \
+    --hash=sha256:0b2f8a0b1e73e9a08b7cc498bb2aeab36cdc1f571f8ab2b35c6e5d1c7115d97d \
+    --hash=sha256:126c09740a6f016e815ab985b21e3a0656835414521c81fc1a8da78b679bdb75 \
+    --hash=sha256:1babf2511e6ffd695da7a983b4e4d6de45dce39577b26b721610711081850906 \
+    --hash=sha256:317ca9f83acbde2883bd6bb27116a741bfcb371369706b4f9973cf30e9a03b0d \
+    --hash=sha256:34cc8d9d010d29fb2b7cbcd5ccc24ffdd80515f65fe9f1e4894ace36b267ce19 \
+    --hash=sha256:5abd2acff939d5bd4701283f009b01496832d50ddafa83c90125a4e41c33e314 \
+    --hash=sha256:5b7974f1f32bc586c90145df51130e02267e4b7e77cab76165c76cf43faca0d9 \
+    --hash=sha256:63017a5f9a74963d24aac7590287149a8d0f1a0799bbe7173c0d8ba1523293c0 \
+    --hash=sha256:9f39f6a811bf3f15177b66c82cbe0d7b1ebad9f190737dcdef77cfca1ea3c19c \
+    --hash=sha256:c01e869b15aec88e2cdb73d27f15bdbe03bce8e2fb43afbe77c45d399e73a5a3 \
+    --hash=sha256:c2c7243d34aaede0efca7a5a96d67fddaebb4ad7e14a70991b9abee9dc5c0379 \
+    --hash=sha256:e39d95a929b112047c25b775035c8c234c5ca67e681ce60d12413afb501129f7 \
+    --hash=sha256:e7e7ced20582d3a5516fb6f405fd1d254e1f5ce712bfef2589f51326af6346e8
+    # via librosa
+scipy==1.15.3 \
+    --hash=sha256:05dc6abcd105e1a29f95eada46d4a3f251743cfd7d3ae8ddb4088047f24ea477 \
+    --hash=sha256:06efcba926324df1696931a57a176c80848ccd67ce6ad020c810736bfd58eb1c \
+    --hash=sha256:0a769105537aa07a69468a0eefcd121be52006db61cdd8cac8a0e68980bbb723 \
+    --hash=sha256:0bdd905264c0c9cfa74a4772cdb2070171790381a5c4d312c973382fc6eaf730 \
+    --hash=sha256:0ff17c0bb1cb32952c09217d8d1eed9b53d1463e5f1dd6052c7857f83127d539 \
+    --hash=sha256:14ed70039d182f411ffc74789a16df3835e05dc469b898233a245cdfd7f162cb \
+    --hash=sha256:185cd3d6d05ca4b44a8f1595af87f9c372bb6acf9c808e99aa3e9aa03bd98cf6 \
+    --hash=sha256:271e3713e645149ea5ea3e97b57fdab61ce61333f97cfae392c28ba786f9bb49 \
+    --hash=sha256:2c620736bcc334782e24d173c0fdbb7590a0a436d2fdf39310a8902505008759 \
+    --hash=sha256:3ac07623267feb3ae308487c260ac684b32ea35fd81e12845039952f558047b8 \
+    --hash=sha256:40e54d5c7e7ebf1aa596c374c49fa3135f04648a0caabcb66c52884b943f02b4 \
+    --hash=sha256:50f9e62461c95d933d5c5ef4a1f2ebf9a2b4e83b0db374cb3f1de104d935922e \
+    --hash=sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed \
+    --hash=sha256:5e721fed53187e71d0ccf382b6bf977644c533e506c4d33c3fb24de89f5c3ed5 \
+    --hash=sha256:6487aa99c2a3d509a5227d9a5e889ff05830a06b2ce08ec30df6d79db5fcd5c5 \
+    --hash=sha256:6ac6310fdbfb7aa6612408bd2f07295bcbd3fda00d2d702178434751fe48e019 \
+    --hash=sha256:6cfd56fc1a8e53f6e89ba3a7a7251f7396412d655bca2aa5611c8ec9a6784a1e \
+    --hash=sha256:76ad1fb5f8752eabf0fa02e4cc0336b4e8f021e2d5f061ed37d6d264db35e3ca \
+    --hash=sha256:79167bba085c31f38603e11a267d862957cbb3ce018d8b38f79ac043bc92d825 \
+    --hash=sha256:7e11270a000969409d37ed399585ee530b9ef6aa99d50c019de4cb01e8e54e62 \
+    --hash=sha256:8c9ed3ba2c8a2ce098163a9bdb26f891746d02136995df25227a20e71c396ebb \
+    --hash=sha256:9db984639887e3dffb3928d118145ffe40eff2fa40cb241a306ec57c219ebbbb \
+    --hash=sha256:b90ab29d0c37ec9bf55424c064312930ca5f4bde15ee8619ee44e69319aab163 \
+    --hash=sha256:c05045d8b9bfd807ee1b9f38761993297b10b245f012b11b13b91ba8945f7e45 \
+    --hash=sha256:c9deabd6d547aee2c9a81dee6cc96c6d7e9a9b1953f74850c179f91fdc729cb7 \
+    --hash=sha256:dde4fc32993071ac0c7dd2d82569e544f0bdaff66269cb475e0f369adad13f11 \
+    --hash=sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf \
+    --hash=sha256:f77f853d584e72e874d87357ad70f44b437331507d1c311457bed8ed2b956126
+    # via
+    #   librosa
+    #   scikit-learn
+    #   utau-webui
+semantic-version==2.10.0 \
+    --hash=sha256:bdabb6d336998cbb378d4b9db3a4b56a1e3235701dc05ea2690d9a997ed5041c \
+    --hash=sha256:de78a3b8e0feda74cabc54aab2da702113e33ac9d9eb9d2389bcf1f58b7d9177
+    # via gradio
+setuptools==80.9.0 \
+    --hash=sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922 \
+    --hash=sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c
+    # via utau-webui
+shellingham==1.5.4 ; sys_platform != 'emscripten' \
+    --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \
+    --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de
+    # via typer
+six==1.17.0 \
+    --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \
+    --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81
+    # via python-dateutil
+sniffio==1.3.1 \
+    --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \
+    --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc
+    # via anyio
+soundfile==0.13.1 \
+    --hash=sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618 \
+    --hash=sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9 \
+    --hash=sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593 \
+    --hash=sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33 \
+    --hash=sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb \
+    --hash=sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445 \
+    --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \
+    --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5
+    # via
+    #   librosa
+    #   utau-webui
+soxr==0.5.0.post1 \
+    --hash=sha256:4704ba6b13a3f1e41d12acf192878384c1c31f71ce606829c64abdf64a8d7d32 \
+    --hash=sha256:7092b9f3e8a416044e1fa138c8172520757179763b85dc53aa9504f4813cff73 \
+    --hash=sha256:a3f16810dd649ab1f433991d2a9661e9e6a116c2b4101039b53b3c3e90a094fc \
+    --hash=sha256:b1be9fee90afb38546bdbd7bde714d1d9a8c5a45137f97478a83b65e7f3146f6 \
+    --hash=sha256:bd052a66471a7335b22a6208601a9d0df7b46b8d087dce4ff6e13eed6a33a2a1 \
+    --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31
+    # via librosa
+standard-aifc==3.13.0 ; python_full_version >= '3.13' \
+    --hash=sha256:64e249c7cb4b3daf2fdba4e95721f811bde8bdfc43ad9f936589b7bb2fae2e43 \
+    --hash=sha256:f7ae09cc57de1224a0dd8e3eb8f73830be7c3d0bc485de4c1f82b4a7f645ac66
+    # via librosa
+standard-chunk==3.13.0 ; python_full_version >= '3.13' \
+    --hash=sha256:17880a26c285189c644bd5bd8f8ed2bdb795d216e3293e6dbe55bbd848e2982c \
+    --hash=sha256:4ac345d37d7e686d2755e01836b8d98eda0d1a3ee90375e597ae43aaf064d654
+    # via standard-aifc
+standard-sunau==3.13.0 ; python_full_version >= '3.13' \
+    --hash=sha256:53af624a9529c41062f4c2fd33837f297f3baa196b0cfceffea6555654602622 \
+    --hash=sha256:b319a1ac95a09a2378a8442f403c66f4fd4b36616d6df6ae82b8e536ee790908
+    # via librosa
+starlette==0.46.2 \
+    --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \
+    --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5
+    # via
+    #   fastapi
+    #   gradio
+threadpoolctl==3.6.0 \
+    --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \
+    --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e
+    # via scikit-learn
+tomlkit==0.13.3 \
+    --hash=sha256:430cf247ee57df2b94ee3fbe588e71d362a941ebb545dec29b53961d61add2a1 \
+    --hash=sha256:c89c649d79ee40629a9fda55f8ace8c6a1b42deb912b2a8fd8d942ddadb606b0
+    # via gradio
+tqdm==4.67.1 \
+    --hash=sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 \
+    --hash=sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2
+    # via huggingface-hub
+typer==0.16.0 ; sys_platform != 'emscripten' \
+    --hash=sha256:1f79bed11d4d02d4310e3c1b7ba594183bcedb0ac73b27a9e5f28f6fb5b98855 \
+    --hash=sha256:af377ffaee1dbe37ae9440cb4e8f11686ea5ce4e9bae01b84ae7c63b87f1dd3b
+    # via gradio
+typing-extensions==4.14.0 \
+    --hash=sha256:8676b788e32f02ab42d9e7c61324048ae4c6d844a399eebace3d4979d75ceef4 \
+    --hash=sha256:a1514509136dd0b477638fc68d6a91497af5076466ad0fa6c338e44e359944af
+    # via
+    #   anyio
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   librosa
+    #   pydantic
+    #   pydantic-core
+    #   typer
+    #   typing-inspection
+typing-inspection==0.4.1 \
+    --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \
+    --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28
+    # via pydantic
+tzdata==2025.2 \
+    --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \
+    --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9
+    # via pandas
+urllib3==2.4.0 \
+    --hash=sha256:414bc6535b787febd7567804cc015fee39daab8ad86268f1310a9250697de466 \
+    --hash=sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813
+    # via
+    #   gradio
+    #   requests
+uvicorn==0.34.3 ; sys_platform != 'emscripten' \
+    --hash=sha256:16246631db62bdfbf069b0645177d6e8a77ba950cfedbfd093acef9444e4d885 \
+    --hash=sha256:35919a9a979d7a59334b6b10e05d77c1d0d574c50e0fc98b8b1a0f165708b55a
+    # via gradio
+websockets==15.0.1 \
+    --hash=sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2 \
+    --hash=sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5 \
+    --hash=sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8 \
+    --hash=sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375 \
+    --hash=sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597 \
+    --hash=sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f \
+    --hash=sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3 \
+    --hash=sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4 \
+    --hash=sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665 \
+    --hash=sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22 \
+    --hash=sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675 \
+    --hash=sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4 \
+    --hash=sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65 \
+    --hash=sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151 \
+    --hash=sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d \
+    --hash=sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee \
+    --hash=sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa \
+    --hash=sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9 \
+    --hash=sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe \
+    --hash=sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561 \
+    --hash=sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215 \
+    --hash=sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931 \
+    --hash=sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f \
+    --hash=sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7
+    # via gradio-client

straycat.py ADDED Viewed

	@@ -0,0 +1,825 @@

+#!/usr/bin/env python3
+"""
+straycat - Yet another WORLD-based UTAU resampler.
+Original source: https://github.com/UtaUtaUtau/straycat
+Copyright (c) UtaUtaUtau
+Licensed under MIT License
+This file is part of the straycat project and is used under the terms
+of the MIT License. See the original repository for full license text.
+"""
+import logging
+logging.basicConfig(format='%(message)s', level=logging.INFO)
+import sys
+import os
+import pyworld as world # Vocoder
+import numpy as np # Numpy <3
+from numba import njit, vectorize, float64, optional # JIT compilation stuff (and ufuncs)
+import soundfile as sf # WAV read + write
+import scipy.signal as signal # for filtering
+import scipy.interpolate as interp # Interpolator for feats
+import scipy.ndimage as ndimage
+import resampy # Resampler (as in sampling rate stuff)
+from pathlib import Path # path manipulation
+import re
+version = '0.4.0'
+help_string = '''usage: straycat in_file out_file pitch velocity [flags] [offset] [length] [consonant] [cutoff] [volume] [modulation] [tempo] [pitch_string]
+Resamples using the WORLD Vocoder.
+arguments:
+\tin_file\t\tPath to input file.
+\tout_file\tPath to output file.
+\tpitch\t\tThe pitch to render on.
+\tvelocity\tThe consonant velocity of the render.
+optional arguments:
+\tflags\t\tThe flags of the render.
+\toffset\t\tThe offset from the start of the render area of the sample. (default: 0)
+\tlength\t\tThe length of the stretched area in milliseconds. (default: 1000)
+\tconsonant\tThe unstretched area of the render in milliseconds. (default: 0)
+\tcutoff\t\tThe cutoff from the end or from the offset for the render area of the sample. (default: 0)
+\tvolume\t\tThe volume of the render in percentage. (default: 100)
+\tmodulation\tThe pitch modulation of the render in percentage. (default: 0)
+\ttempo\t\tThe tempo of the render. Needs to have a ! at the start. (default: !100)
+\tpitch_string\tThe UTAU pitchbend parameter written in Base64 with RLE encoding. (default: AA)'''
+# Semitone offset of each note name inside one octave (C = 0 ... B = 11).
+notes = {'C' : 0, 'C#' : 1, 'D' : 2, 'D#' : 3, 'E' : 4, 'F' : 5, 'F#' : 6, 'G' : 7, 'G#' : 8, 'A' : 9, 'A#' : 10, 'B' : 11} # Note names lol
+# Group 1 is the note name (optional sharp), group 2 the possibly negative octave number, e.g. "C#4".
+note_re = re.compile(r'([A-G]#?)(-?\d+)') # Note Regex for conversion
+# Every signal handled by this module is kept at this sampling rate.
+default_fs = 44100 # UTAU only really likes 44.1khz
+# FFT size WORLD's CheapTrick uses for default_fs at the default F0 floor.
+fft_size = world.get_cheaptrick_fft_size(default_fs, world.default_f0_floor) # It's just 2048 but you know
+# Replaces the input file's suffix to form the feature cache path.
+cache_ext = '.sc.npz' # cache file extension
+# Giving it better range
+# F0 search range handed to the F0 estimator and used as the default limits when averaging F0.
+f0_floor = world.default_f0_floor
+f0_ceil = 1760
+# Flags
+# Recognised flag names. They are joined into a regex alternation below, so list order is the matching priority.
+flags = ['fe', 'fl', 'fo', 'fv', 'fp', 've', 'vo', 'g', 't', 'A', 'B', 'G', 'P', 'S', 'p', 'R', 'D', 'C']
+flag_re = '|'.join(flags)
+# Group 1 is the flag name, group 2 an optional signed integer value (empty when the flag has no value).
+flag_re = f'({flag_re})([+-]?\\d+)?'
+flag_re = re.compile(flag_re)
+# Utility functions
+@vectorize([float64(float64, float64, float64)], nopython=True)
+def smoothstep(edge0, edge1, x):
+    """Smoothstep function from GLSL that works with numpy arrays."""
+    x = (x - edge0) / (edge1 - edge0)
+    if x < 0:
+        x = 0
+    elif x > 1:
+        x = 1
+    return 3*x*x - 2*x*x*x
+@vectorize([float64(float64, float64, float64)], nopython=True)
+def clip(x, x_min, x_max):
+    """Clips function. Faster than np.clip somehow"""
+    if x < x_min:
+        return x_min
+    if x > x_max:
+        return x_max
+    return x
+@vectorize([float64(float64, float64)], nopython=True)
+def bias(x, a):
+    """Element-wise Schlick bias function."""
+    if a == 0:
+        return 0
+    if a == 1:
+        return 1
+    return x / ((1 / a - 2) * (1 - x) + 1)
+def highpass(x, fs=44100, cutoff=3000, order=1):
+    """Butterworth highpass with doubled order because of sosfiltfilt."""
+    nyq = 0.5 * fs
+    cut = cutoff / nyq
+    sos = signal.butter(order, cut, btype='high', output='sos')
+    return signal.sosfiltfilt(sos, x)
+def lowpass(x, fs=44100, cutoff=16000, order=1):
+    """Butterworth lowpass with doubled order because of sosfiltfilt."""
+    nyq = 0.5 * fs
+    cut = cutoff / nyq
+    sos = signal.butter(order, cut, btype='low', output='sos')
+    return signal.sosfiltfilt(sos, x)
+# Pitch string interpreter
+def to_uint6(b64):
+    """Convert one Base64 character to an unsigned integer.
+    Parameters
+    ----------
+    b64 : str
+        The Base64 character.
+    Returns
+    -------
+    int
+        The equivalent of the Base64 character as an integer.
+    """
+    c = ord(b64) # Convert based on ASCII mapping
+    if c >= 97:
+        return c - 71
+    elif c >= 65:
+        return c - 65
+    elif c >= 48:
+        return c + 4
+    elif c == 43:
+        return 62
+    elif c == 47:
+        return 63
+    else:
+        raise Exception
+def to_int12(b64):
+    """Converts two Base64 characters to a signed 12-bit integer.
+    Parameters
+    ----------
+    b64 : str
+        The Base64 string.
+    Returns
+    -------
+    int
+        The equivalent of the Base64 characters as a signed 12-bit integer (-2047 to 2048)
+    """
+    uint12 = to_uint6(b64[0]) << 6 | to_uint6(b64[1]) # Combined uint6 to uint12
+    if uint12 >> 11 & 1 == 1: # Check most significant bit to simulate two's complement
+        return uint12 - 4096
+    else:
+        return uint12
+def to_int12_stream(b64):
+    """Converts a Base64 string to a list of integers.
+    Parameters
+    ----------
+    b64 : str
+        The Base64 string.
+    Returns
+    -------
+    list
+        The equivalent of the Base64 string if split every 12-bits and interpreted as a signed 12-bit integer.
+    """
+    res = []
+    for i in range(0, len(b64), 2):
+        res.append(to_int12(b64[i:i+2]))
+    return res
+def pitch_string_to_cents(x):
+    """Converts UTAU's pitchbend argument to an ndarray representing the pitch offset in cents.
+    Parameters
+    ----------
+    x : str
+        The pitchbend argument.
+    Returns
+    -------
+    ndarray
+        The pitchbend argument as pitch offset in cents.
+    """
+    pitch = x.split('#') # Split RLE Encoding
+    res = []
+    for i in range(0, len(pitch), 2):
+        # Go through each pair
+        p = pitch[i:i+2]
+        if len(p) == 2:
+            # Decode pitch string and extend RLE
+            pitch_str, rle = p
+            res.extend(to_int12_stream(pitch_str))
+            res.extend([res[-1]] * int(rle))
+        else:
+            # Decode last pitch string without RLE if it exists
+            res.extend(to_int12_stream(p[0]))
+    res = np.array(res, dtype=np.int32)
+    if np.all(res == res[0]):
+        return np.zeros(res.shape)
+    else:
+        return np.concatenate([res, np.zeros(1)])
+# Pitch conversion
+def note_to_midi(x):
+    """Note name to MIDI note number."""
+    note, octave = note_re.match(x).group(1, 2)
+    octave = int(octave) + 1
+    return octave * 12 + notes[note]
+def midi_to_hz(x):
+    """MIDI note number to Hertz using equal temperament. A4 = 440 Hz."""
+    return 440 * np.exp2((x - 69) / 12)
+##def hz_to_midi(x):
+##    return 12 * np.log2(x / 440) + 69
+# WAV read/write
+def read_wav(loc):
+    """Read audio files supported by soundfile and resample to 44.1kHz if needed. Mixes down to mono if needed.
+    Parameters
+    ----------
+    loc : str or file
+        Input audio file.
+    Returns
+    -------
+    ndarray
+        Data read from WAV file remapped to [-1, 1] and in 44.1kHz
+    """
+    if type(loc) == str: # make sure input is Path
+        loc = Path(loc)
+    exists = loc.exists()
+    if not exists: # check for alternative files
+        for ext in sf.available_formats().keys():
+            loc = loc.with_suffix('.' + ext.lower())
+            exists = loc.exists()
+            if exists:
+                break
+    if not exists:
+        raise FileNotFoundError("No supported audio file was found.")
+    x, fs = sf.read(loc)
+    if len(x.shape) == 2:
+        # Average all channels... Probably not too good for formats bigger than stereo
+        x = np.mean(x, axis=1)
+    if fs != default_fs:
+        x = resampy.resample(x, fs, default_fs)
+    return x
+def save_wav(loc, x):
+    """Save data into a WAV file.
+    Parameters
+    ----------
+    loc : str or file
+        Output WAV file.
+    x : ndarray
+        Audio data in 44.1kHz within [-1, 1].
+    Returns
+    -------
+    None
+    """
+    sf.write(loc, x, default_fs, 'PCM_16')
+# Processing WORLD things
+@njit(float64(float64[:], optional(float64), optional(float64)))
+def _jit_base_frq(f0, f0_min, f0_max):
+    q = 0
+    avg_frq = 0
+    tally = 0
+    N = len(f0)
+    if f0_min is None:
+        f0_min = f0_floor
+    if f0_max is None:
+        f0_max = f0_ceil
+    for i in range(N):
+        if f0[i] >= f0_min and f0[i] <= f0_max:
+            if i < 1:
+                q = f0[i+1] - f0[i]
+            elif i == N - 1:
+                q = f0[i] - f0[i-1]
+            else:
+                q = (f0[i+1] - f0[i-1]) / 2
+            weight = 2 ** (-q * q)
+            avg_frq += f0[i] * weight
+            tally += weight
+    if tally > 0:
+        avg_frq /= tally
+    return avg_frq
+def base_frq(f0, f0_min=None, f0_max=None):
+    """Get average F0 with a stronger bias on flatter areas.
+    Parameters
+    ----------
+    f0 : list or ndarray
+        Array of F0 values.
+    f0_min : float, optional
+        Lower F0 limit.
+    f0_max : float, optional
+        Upper F0 limit.
+    Returns
+    -------
+    float
+        Average F0.
+    """
+    return _jit_base_frq(f0, f0_min, f0_max)
+class Resampler:
+    """
+    A class for the UTAU resampling process.
+    Attributes
+    ----------
+    in_file : str
+        Path to input file.
+    out_file : str
+        Path to output file.
+    pitch : str
+        The pitch of the note.
+    velocity : str or float
+        The consonant velocity of the note.
+    flags : str
+        The flags of the note.
+    offset : str or float
+        The offset from the start for the render area of the sample.
+    length : str or int
+        The length of the stretched area in milliseconds.
+    consonant : str or float
+        The unstretched area of the render.
+    cutoff : str or float
+        The cutoff from the end or from the offset for the render area of the sample.
+    volume : str or float
+        The volume of the note in percentage.
+    modulation : str or float
+        The modulation of the note in percentage.
+    tempo : str
+        The tempo of the note.
+    pitch_string : str
+        The UTAU pitchbend parameter.
+    Methods
+    -------
+    render(self):
+        The rendering workflow. Immediately starts when class is initialized.
+    get_features(self):
+        Gets the WORLD features either from a cached file or generating it if it doesn't exist.
+    generate_features(self, features_path):
+        Generates WORLD features and saves it for later.
+    resample(self, features):
+        Renders a WAV file using the passed WORLD features.
+    """
+    def __init__(self, in_file, out_file, pitch, velocity, flags='', offset=0, length=1000, consonant=0, cutoff=0, volume=100, modulation=0, tempo='!100', pitch_string='AA'):
+        """Initializes the renderer and immediately starts it.
+        Parameters
+        ---------
+        in_file : str
+            Path to input file.
+        out_file : str
+            Path to output file.
+        pitch : str
+            The pitch of the note.
+        velocity : str or float
+            The consonant velocity of the note.
+        flags : str
+            The flags of the note.
+        offset : str or float
+            The offset from the start for the render area of the sample.
+        length : str or int
+            The length of the stretched area in milliseconds.
+        consonant : str or float
+            The unstretched area of the render.
+        cutoff : str or float
+            The cutoff from the end or from the offset for the render area of the sample.
+        volume : str or float
+            The volume of the note in percentage.
+        modulation : str or float
+            The modulation of the note in percentage.
+        tempo : str
+            The tempo of the note.
+        pitch_string : str
+            The UTAU pitchbend parameter.
+        """
+        self.in_file = Path(in_file)
+        self.out_file = out_file
+        self.pitch = note_to_midi(pitch)
+        self.velocity = float(velocity)
+        self.flags = {k : int(v) if v else None for k, v in flag_re.findall(flags.replace('/', ''))}
+        self.offset = float(offset)
+        self.length = int(length)
+        self.consonant = float(consonant)
+        self.cutoff = float(cutoff)
+        self.volume = float(volume)
+        self.modulation = float(modulation)
+        self.tempo = float(tempo[1:])
+        self.pitchbend = pitch_string_to_cents(pitch_string)
+        self.render()
+    def render(self):
+        """The rendering workflow. Immediately starts when class is initialized.
+        Parameters
+        ----------
+        None
+        """
+        features = self.get_features()
+        self.resample(features)
+    def get_features(self):
+        """Gets the WORLD features either from a cached file or generating it if it doesn't exist.
+        Parameters
+        ----------
+        None
+        Returns
+        -------
+        features : dict
+            A dictionary of the F0, MGC, BAP, and average F0.
+        """
+        # Setup cache path file
+        fname = self.in_file.name
+        features_path = self.in_file.with_suffix(cache_ext)
+        features = None
+        if 'G' in self.flags.keys():
+            logging.info('G flag exists. Forcing feature generation.')
+            features = self.generate_features(features_path)
+        elif os.path.exists(features_path):
+            # Load if it exists
+            logging.info(f'Reading {fname}{cache_ext}.')
+            features = np.load(features_path)
+        else:
+            # Generate if not
+            logging.info(f'{fname}{cache_ext} not found. Generating features.')
+            features = self.generate_features(features_path)
+        return features
+    def generate_features(self, features_path):
+        """Generates WORLD features and saves it for later.
+        Parameters
+        ----------
+        features_path : str or file
+            The path for caching the features.
+        Returns
+        -------
+        features : dict
+            A dictionary of the F0, MGC, BAP, and average F0.
+        """
+        x = read_wav(self.in_file)
+        # Check if audio is long enough
+        min_samples = int(default_fs * 0.1)  # 최소 100ms
+        if len(x) < min_samples:
+            logging.warning(f'Audio too short ({len(x)} samples < {min_samples}). Padding with zeros.')
+            # 패딩으로 최소 길이 보장
+            x = np.pad(x, (0, min_samples - len(x)), mode='constant', constant_values=0)
+        logging.info('Generating F0.')
+        f0, t = world.harvest(x, default_fs, f0_floor=f0_floor, f0_ceil=f0_ceil)
+        base_f0 = base_frq(f0)
+        logging.info('Generating spectral envelope.')
+        sp = world.cheaptrick(x, f0, t, default_fs)
+        mgc = world.code_spectral_envelope(sp, default_fs, 64)
+        logging.info('Generating aperiodicity.')
+        ap = world.d4c(x, f0, t, default_fs, threshold=0.25)
+        bap = world.code_aperiodicity(ap, default_fs)
+        logging.info('Saving features.')
+        features = {'base' : base_f0, 'f0' : f0, 'mgc' : mgc, 'bap' : bap}
+        np.savez_compressed(features_path, **features)
+        return features
+    def resample(self, features):
+        """Renders a WAV file using the passed WORLD features.
+        Parameters
+        ----------
+        features : dict
+            A dictionary of the F0, MGC, BAP, and average F0.
+        Returns
+        -------
+        None
+        """
+        if self.out_file == 'nul':
+            logging.info('Null output file. Skipping...')
+            return
+        self.out_file = Path(self.out_file)
+        # Convert percentages to decimal
+        vel = np.exp2(1 - self.velocity / 100) # convel is more a multiplier...
+        vol = self.volume / 100
+        mod = self.modulation / 100
+        logging.info('Decoding WORLD features.')
+        # Recalculate spectral envelope and aperiodicity
+        sp = world.decode_spectral_envelope(features['mgc'], default_fs, fft_size)
+        ap = world.decode_aperiodicity(features['bap'], default_fs, fft_size)
+        # Turn F0 to offset map for modulation
+        base_f0 = features['base']
+        f0 = features['f0']
+        f0[f0 == 0] = base_f0
+        f0_off = f0 - base_f0
+        # Calculate temporal positions
+        t_area = np.arange(len(f0)) * 0.005
+        logging.info('Calculating timing.') # use seconds instead of 5ms terms cuz someone gave me negative offsets </3
+        start = self.offset / 1000 # start time
+        end = self.cutoff / 1000 # end time
+        if self.cutoff < 0: # deal with relative end time
+            end = start - end
+        else:
+            end = t_area[-1] - end
+        con = start + self.consonant / 1000 # consonant
+        logging.info('Preparing interpolators.')
+        # Check if we have enough data points for interpolation
+        if len(t_area) < 2 or len(f0_off) < 2:
+            logging.error(f'Insufficient data for interpolation: t_area={len(t_area)}, f0_off={len(f0_off)}')
+            # Create a minimal valid signal
+            if len(t_area) < 2:
+                t_area = np.array([0.0, 0.01])  # 10ms minimum
+            if len(f0_off) < 2:
+                f0_off = np.array([0.0, 0.0])
+            if len(sp) < 2:
+                sp = np.repeat(sp[:1], 2, axis=0) if len(sp) == 1 else np.zeros((2, fft_size//2+1))
+            if len(ap) < 2:
+                ap = np.repeat(ap[:1], 2, axis=0) if len(ap) == 1 else np.zeros((2, fft_size//2+1))
+        # Make interpolators to render new areas
+        f0_off_interp = interp.UnivariateSpline(t_area, f0_off, s=0, ext='const')
+        sp_interp = interp.Akima1DInterpolator(t_area, sp)
+        ap_interp = interp.Akima1DInterpolator(t_area, ap)
+        # Make new temporal positions array for stretching
+        t_consonant = np.linspace(start, con, num=int(vel * self.consonant / 5), endpoint=False) # temporal positions of the unstretched area. can be stretched because of velocity
+        # stretched area only needs to stretch if the length required is longer than the stretch area
+        length_req = self.length / 1000
+        stretch_length = end - con
+        if stretch_length > length_req:
+            con_idx = int(200 * con) # position of consonant in the temporal positions array ??
+            len_idx = int(200 * length_req) # length of length required by 5ms frames
+            t_stretch = t_area[con_idx:con_idx+len_idx]
+        else:
+            t_stretch = np.linspace(con, end, num=int(200 * length_req))
+        t_render = clip(np.concatenate([t_consonant, t_stretch]), 0, t_area[-1]) # concatenate and clip for interpolation
+        con = len(t_consonant) # new placement of the consonant, now in 5ms frame terms...
+        logging.info('Interpolating WORLD features.')
+        # Interpolate render area
+        f0_off_render = f0_off_interp(t_render)
+        sp_render = sp_interp(t_render)
+        ap_render = clip(ap_interp(t_render), 0, 1) # aperiodicity freaks out if not within [0, 1] range
+        # Calculate new temporal positions for tuning
+        t = np.arange(len(sp_render)) * 0.005
+        logging.info('Calculating pitch.')
+        # Calculate pitch in MIDI note number terms
+        pitch = self.pitchbend / 100 + self.pitch
+        t_pitch = 60 * np.arange(len(pitch)) / (self.tempo * 96)
+        # Check if we have enough pitch data points
+        if len(pitch) < 2 or len(t_pitch) < 2:
+            logging.warning(f'Insufficient pitch data: len(pitch)={len(pitch)}, len(t_pitch)={len(t_pitch)}')
+            # Create minimal pitch data
+            if len(pitch) < 2:
+                pitch = np.array([self.pitch, self.pitch])
+            if len(t_pitch) < 2:
+                t_pitch = np.array([0.0, 0.01])
+        pitch_interp = interp.Akima1DInterpolator(t_pitch, pitch)
+        pitch_render = pitch_interp(clip(t, 0, t_pitch[-1]))
+        logging.info('Checking flags.')
+        # Flag interpretation area
+        ### BEFORE HZ CONVERSION FLAGS ###
+        # Pitch offset flag
+        if 't' in self.flags.keys():
+            pitch_render += self.flags['t'] / 100
+        # Convert pitch to Hertz and add F0 offset for modulation
+        f0_render = midi_to_hz(pitch_render) + f0_off_render * mod
+        ### BEFORE RENDER FLAGS ###
+        # Vocal Fry flag
+        if 'fe' in self.flags.keys():
+            logging.info('Adding vocal fry.')
+            fry = self.flags['fe'] / 1000
+            fry_len = 0.075
+            fry_offset = 0
+            fry_pitch = f0_floor
+            if 'fl' in self.flags.keys(): # check length flag
+                fry_len = max(self.flags['fl'] / 1000, 0.001)
+            if 'fo' in self.flags.keys():
+                fry_offset = self.flags['fo'] / 1000
+            if 'fp' in self.flags.keys():
+                fry_pitch = max(self.flags['fp'], 0)
+            # Prepare envelope
+            t_fry = t - t[con] - fry_offset # temporal positions centered around the consonant shifted by offset
+            amt = smoothstep(-fry - fry_len / 2, -fry + fry_len / 2, t_fry) * smoothstep(fry_len / 2, -fry_len / 2, t_fry) #fry envelope
+            f0_render = f0_render * (1 - amt) + fry_pitch * amt # mix low F0 for fry
+        # Gender/Formant shift flag
+        if 'g' in self.flags.keys():
+            logging.info('Shifting formants.')
+            gender = np.exp2(self.flags['g'] / 120)
+            freq_x = np.linspace(0, 1, fft_size // 2 + 1) # map spectral envelope by frequency instead of time
+            sp_render_interp = interp.Akima1DInterpolator(freq_x, sp_render, axis=1)
+            # stretch spectral envelope depending on gender
+            freq_x = clip(np.linspace(0, gender, fft_size // 2 + 1), 0, 1) # clip axis because Akima1DInterpolator doesn't extrapolate (or even just extend)
+            sp_render = sp_render_interp(freq_x).copy(order='C')
+        # map unvoicedness (kinda like voisona huskiness)
+        husk = np.mean(ap_render, axis=1)
+        # Breathiness flag
+        if 'B' in self.flags.keys():
+            breath = self.flags['B']
+            if breath <= 50: # Raise power to flatten smaller areas and keep max aperiodicity
+                logging.info('Lowering breathiness.')
+                breath = breath / 100
+                ap_render = bias(ap_render, breath)
+                ap_render[np.isclose(husk, 1),:] = 1 # make sure unvoiced areas stay unvoiced... only happens if breathiness is 0 but too much if statements
+        else:
+            breath = 0
+        # Distortion flag
+        if 'D' in self.flags.keys():
+            logging.info('Adding distortion.')
+            distortion_amount = clip(self.flags['D'], 0, 100)
+            ap_render = ap_render * (distortion_amount / 10)
+            f0_render = f0_render + np.random.normal(0, distortion_amount, len(f0_render))
+        # Coarsness flag
+        if 'C' in self.flags.keys():
+            logging.info('Adding coarseness.')
+            coarseness = clip(self.flags['C'], 0, 100)
+            for i in range(len(f0_render)):
+                if i % 6 == 0:
+                    f0_render[i] = 60
+        #Peak compressor flag
+        flag_peak = self.flags.get('P', 86)
+        peak = 1 - flag_peak / 100
+        if flag_peak > 0:
+            rms = np.sqrt(2 * np.sum(sp_render, axis=1) / fft_size ** 2 + 0.000001) # get RMS.. i'm not sure if this is right but i think it's fine
+            rms_peak = np.max(rms)
+            rms_norm = rms / (peak * rms_peak)
+            comp = np.zeros(rms_norm.shape)
+            comp[rms_norm >= 1] = rms_norm[rms_norm >= 1] - 1
+            comp = (1 - peak) * comp / np.max(comp)
+            comp = 1 - comp
+            comp = ndimage.gaussian_filter1d(comp, 6)
+            comp = np.vstack([comp] * sp_render.shape[1]).transpose()
+            sp_render *= comp
+            ap_render *= comp
+        # remove pitch in areas with max aperiodicity
+        f0_render[np.isclose(husk, 1)] = 0
+        render = world.synthesize(f0_render, sp_render, ap_render, default_fs)
+        ### AFTER RENDER FLAGS ###
+        # Max aperiodicity flag
+        if 'S' in self.flags.keys():
+            amt = clip(self.flags['S'] / 100, 0, 1)
+            render_ap = world.synthesize(f0_render, sp_render, np.ones(ap_render.shape), default_fs)
+            render = render * (1 - amt) + render_ap * amt
+        if breath > 50: # mix max breathiness signal
+            logging.info('Raising breathiness.')
+            breath = clip((breath - 50) / 50, 0, 1)
+            render_breath = world.synthesize(f0_render, sp_render * np.square(ap_render), np.ones(ap_render.shape), default_fs) # apply band AP on regular specgram, max out ap
+            render = render * (1 - breath) + render_breath * breath # Mix signals
+        t_sample = np.arange(len(render)) / default_fs # temporal position per sample
+        if 'fe' in self.flags.keys():
+            fry = self.flags['fe'] / 1000
+            fry_len = 0.05
+            fry_offset = 0
+            fry_vol = 0.1
+            if 'fl' in self.flags.keys(): # check length flag
+                fry_len = max(self.flags['fl'] / 1000, 0.001)
+            if 'fo' in self.flags.keys():
+                fry_offset = self.flags['fo'] / 1000
+            if 'fv' in self.flags.keys():
+                fry_vol = clip(self.flags['fv'] / 100, 0, 1)
+            # Prepare envelope
+            t_fry = t_sample - t[con] - fry_offset # temporal positions centered around the consonant shifted by offset
+            amt = smoothstep(-fry - fry_len / 2, -fry + fry_len / 2, t_fry) * smoothstep(fry_len / 2, -fry_len / 2, t_fry) #fry envelope
+            env = 1 - amt + fry_vol * amt
+            render_hp = highpass(render, cutoff=300) # add a highpass through the fry area
+            render = render * (1 - amt) + render_hp * amt
+            render *= env
+        # Fix voicing flag
+        if 've' in self.flags.keys():
+            logging.info('Fixing voicing.')
+            end_breath = self.flags['ve'] / 1000
+            render_breath = world.synthesize(f0_render, sp_render * np.square(ap_render), np.ones(ap_render.shape), default_fs) # apply band AP on regular specgram, max out ap
+            offset = 0
+            if 'vo' in self.flags.keys(): # check offset flag
+                offset = self.flags['vo'] / 1000
+                logging.info(offset)
+            amt = smoothstep(-end_breath / 2, end_breath / 2, t_sample - t[con] - offset) # smoothstep with consonant at 0.5
+            render = render * (1 - amt) + render_breath * amt # mix sample based on envelope
+        normalize = self.flags.get('p', 6)
+        if normalize >= 0:
+            normal = render / np.max(render)
+            render = normal * (10 ** (-normalize / 20))
+        ### AFTER PEAK NORMALIZATION ###
+        # Tremolo flag
+        if 'A' in self.flags.keys():
+            logging.info('Adding tremolo.')
+            tremolo = self.flags['A'] / 100
+            pitch_sample = pitch_interp(clip(t_sample, 0, t_pitch[-1])) # probably bad because of how low the sampling rate is for the pitch
+            pitch_smooth = lowpass(pitch_sample, cutoff=8, order=16)
+            vibrato = highpass(pitch_smooth, cutoff=4, order=16)
+            amt = np.maximum(tremolo * vibrato + 1, 0)
+            render = render * amt
+        # Growl flag
+        if 'R' in self.flags.keys():
+            logging.info('Adding tremolo growl flag.')
+            depth = clip(self.flags['R'] / 100, 0, 1)
+            rate = 75
+            time = np.arange(len(render)) / default_fs
+            sine_wave = np.sin(2 * np.pi * rate * time)
+            render = render * (2 - depth * sine_wave) / 2
+        render *= vol # volume
+        save_wav(self.out_file, render)
+if __name__ == '__main__':
+    logging.info(f'straycat {version}')
+    try:
+        Resampler(*sys.argv[1:])
+    except Exception as e:
+        name = e.__class__.__name__
+        if name == 'TypeError':
+            logging.info(help_string)
+        else:
+            raise e

test_compressed_voicebank.py ADDED Viewed

	@@ -0,0 +1,158 @@

+#!/usr/bin/env python3
+"""
+압축된 보이스뱅크 테스트 스크립트
+압축 전후의 성능과 정확성을 비교합니다.
+"""
+import time
+import numpy as np
+from pathlib import Path
+from utau_engine import UTAUEngine
+from compressed_utau_engine import CompressedUTAUEngine
+import logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def test_voicebank_comparison():
+    """원본과 압축된 보이스뱅크 비교 테스트"""
+    # 경로 설정
+    original_path = "voice/hanseol CVC"
+    compressed_path = "voice/hanseol_CVC_compressed.h5"
+    print("🔍 보이스뱅크 비교 테스트 시작")
+    print("=" * 50)
+    # 압축된 보이스뱅크가 없으면 종료
+    if not Path(compressed_path).exists():
+        print("❌ 압축된 보이스뱅크를 찾을 수 없습니다.")
+        print("먼저 voice_data_converter.py를 실행하세요.")
+        return
+    # 1. 로딩 시간 비교
+    print("\n📊 1. 로딩 시간 비교")
+    # 원본 로딩
+    if Path(original_path).exists():
+        start_time = time.time()
+        try:
+            original_engine = UTAUEngine(original_path)
+            original_load_time = time.time() - start_time
+            print(f"   원본 보이스뱅크 로딩: {original_load_time:.2f}초")
+        except Exception as e:
+            print(f"   원본 로딩 실패: {e}")
+            original_engine = None
+            original_load_time = float('inf')
+    else:
+        print("   원본 보이스뱅크 없음")
+        original_engine = None
+        original_load_time = float('inf')
+    # 압축된 버전 로딩
+    start_time = time.time()
+    try:
+        compressed_engine = CompressedUTAUEngine(compressed_path)
+        compressed_load_time = time.time() - start_time
+        print(f"   압축된 보이스뱅크 로딩: {compressed_load_time:.2f}초")
+        if original_load_time != float('inf'):
+            speedup = original_load_time / compressed_load_time
+            print(f"   로딩 속도 개선: {speedup:.1f}배")
+    except Exception as e:
+        print(f"   압축된 버전 로딩 실패: {e}")
+        return
+    # 2. 메모리 사용량 및 압축 정보
+    print("\n📊 2. 압축 정보")
+    compression_info = compressed_engine.get_compression_info()
+    print(f"   원본 크기: {compression_info['original_size_mb']:.1f} MB")
+    print(f"   압축 크기: {compression_info['compressed_size_mb']:.1f} MB")
+    print(f"   압축율: {compression_info['compression_ratio']:.1f}%")
+    # 3. 음소 수 비교
+    print("\n📊 3. 음소 정보")
+    compressed_phonemes = compressed_engine.get_available_phonemes()
+    print(f"   압축된 버전 음소 수: {len(compressed_phonemes)}개")
+    if original_engine:
+        original_phonemes = original_engine.get_available_phonemes()
+        print(f"   원본 음소 수: {len(original_phonemes)}개")
+        # 음소 일치도 확인
+        original_set = set(original_phonemes)
+        compressed_set = set(compressed_phonemes)
+        match_rate = len(original_set & compressed_set) / len(original_set) * 100
+        print(f"   음소 일치도: {match_rate:.1f}%")
+    # 4. 합성 테스트
+    print("\n📊 4. 합성 성능 테스트")
+    # 테스트용 노트 시퀀스
+    test_notes = [
+        {
+            "pitch": 60,  # C4
+            "startSeconds": 0.0,
+            "durationSeconds": 0.5,
+            "endSeconds": 0.5,
+            "velocity": 100
+        },
+        {
+            "pitch": 64,  # E4
+            "startSeconds": 0.5,
+            "durationSeconds": 0.5,
+            "endSeconds": 1.0,
+            "velocity": 100
+        },
+        {
+            "pitch": 67,  # G4
+            "startSeconds": 1.0,
+            "durationSeconds": 0.5,
+            "endSeconds": 1.5,
+            "velocity": 100
+        }
+    ]
+    test_lyrics = ["도", "미", "솔"]
+    # 압축된 버전 합성 테스트
+    start_time = time.time()
+    compressed_result, compressed_status = compressed_engine.synthesize_sequence(
+        test_notes, test_lyrics, tempo=120, volume=100
+    )
+    compressed_synth_time = time.time() - start_time
+    print(f"   압축된 버전 합성 시간: {compressed_synth_time:.2f}초")
+    print(f"   압축된 버전 상태: {compressed_status}")
+    # 원본 합성 테스트 (있을 경우)
+    if original_engine:
+        start_time = time.time()
+        original_result, original_status = original_engine.synthesize_sequence(
+            test_notes, test_lyrics, tempo=120, volume=100
+        )
+        original_synth_time = time.time() - start_time
+        print(f"   원본 합성 시간: {original_synth_time:.2f}초")
+        print(f"   원본 상태: {original_status}")
+        if original_synth_time > 0:
+            speedup = original_synth_time / compressed_synth_time
+            print(f"   합성 속도 개선: {speedup:.1f}배")
+    # 5. 권장사항
+    print("\n💡 5. 권장사항")
+    print("   ✅ HDF5 압축 방식의 장점:")
+    print("      - 단일 파일로 관리 용이")
+    print("      - 높은 압축율로 저장공간 절약")
+    print("      - 빠른 랜덤 액세스")
+    print("      - Hugging Face Spaces 최적화")
+    print("   ✅ Gradio/HF Spaces 배포 시:")
+    print("      - 압축된 .h5 파일만 업로드")
+    print("      - 원본 WAV 파일들은 제외")
+    print("      - 빠른 앱 시작 시간")
+    print("      - 낮은 스토리지 비용")
+# Script entry point: run the original-vs-compressed voicebank comparison.
+if __name__ == "__main__":
+    test_voicebank_comparison()

utau_engine.py ADDED Viewed

	@@ -0,0 +1,467 @@

+import os
+import re
+import logging
+import tempfile
+import numpy as np
+import soundfile as sf
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+from dataclasses import dataclass
+from straycat import Resampler
+# Logging setup: configure the root logger at INFO level and create this module's logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+@dataclass
+class OtoEntry:
+    """UTAU oto.ini 엔트리 클래스"""
+    filename: str          # WAV 파일명
+    alias: str            # 별명 (발음)
+    offset: float         # 오프셋 (ms)
+    consonant: float      # 자음 길이 (ms)
+    cutoff: float         # 컷오프 (ms)
+    preutterance: float   # 프리유터런스 (ms)
+    overlap: float        # 오버랩 (ms)
+    @property
+    def is_breath(self) -> bool:
+        """숨소리/무음 구간인지 확인"""
+        return self.alias.startswith('-') or self.alias.startswith('*')
+    @property
+    def clean_alias(self) -> str:
+        """접두사 제거된 순수 별명"""
+        alias = self.alias
+        if alias.startswith('- '):
+            return alias[2:]
+        elif alias.startswith('* '):
+            return alias[2:]
+        elif alias.startswith('-'):
+            return alias[1:]
+        elif alias.startswith('*'):
+            return alias[1:]
+        return alias
+class VoicebankManager:
+    """UTAU 보이스뱅크 관리 클래스"""
+    def __init__(self, voicebank_path: Union[str, Path]):
+        self.voicebank_path = Path(voicebank_path)
+        self.oto_entries: Dict[str, OtoEntry] = {}
+        self.wav_files: Dict[str, Path] = {}
+        self.load_voicebank()
+    def load_voicebank(self):
+        """보이스뱅크 로드"""
+        if not self.voicebank_path.exists():
+            raise FileNotFoundError(f"보이스뱅크 경로를 찾을 수 없습니다: {self.voicebank_path}")
+        # oto.ini 파일 찾기
+        oto_file = self.voicebank_path / "oto.ini"
+        if not oto_file.exists():
+            raise FileNotFoundError(f"oto.ini 파일을 찾을 수 없습니다: {oto_file}")
+        # WAV 파일들 인덱싱
+        self._index_wav_files()
+        # oto.ini 파싱
+        self._parse_oto_ini(oto_file)
+        logger.info(f"보이스뱅크 로드 완료: {len(self.oto_entries)}개 엔트리, {len(self.wav_files)}개 WAV 파일")
+    def _index_wav_files(self):
+        """WAV 파일들 인덱싱"""
+        for wav_file in self.voicebank_path.glob("*.wav"):
+            self.wav_files[wav_file.name] = wav_file
+        # 하위 폴더도 검색
+        for subfolder in self.voicebank_path.iterdir():
+            if subfolder.is_dir():
+                for wav_file in subfolder.glob("*.wav"):
+                    self.wav_files[wav_file.name] = wav_file
+    def _parse_oto_ini(self, oto_file: Path):
+        """oto.ini 파일 파싱"""
+        try:
+            # 다양한 인코딩으로 시도
+            encodings = ['shift_jis', 'utf-8', 'cp932', 'euc-jp']
+            content = None
+            for encoding in encodings:
+                try:
+                    with open(oto_file, 'r', encoding=encoding) as f:
+                        content = f.read()
+                    logger.info(f"oto.ini를 {encoding} 인코딩으로 읽었습니다.")
+                    break
+                except UnicodeDecodeError:
+                    continue
+            if content is None:
+                raise Exception("oto.ini 파일을 읽을 수 없습니다. 인코딩 문제가 있을 수 있습니다.")
+            # 각 라인 파싱
+            for line_num, line in enumerate(content.strip().split('\n'), 1):
+                line = line.strip()
+                if not line or line.startswith('#'):
+                    continue
+                try:
+                    self._parse_oto_line(line)
+                except Exception as e:
+                    logger.warning(f"oto.ini {line_num}번째 줄 파싱 실패: {e}")
+                    continue
+        except Exception as e:
+            logger.error(f"oto.ini 파싱 실패: {e}")
+            raise
+    def _parse_oto_line(self, line: str):
+        """oto.ini 한 줄 파싱"""
+        # 형식: filename=alias,offset,consonant,cutoff,preutterance,overlap
+        if '=' not in line:
+            return
+        filename, params = line.split('=', 1)
+        parts = params.split(',')
+        if len(parts) != 6:
+            logger.warning(f"잘못된 oto.ini 형식: {line}")
+            return
+        try:
+            alias = parts[0]
+            offset = float(parts[1])
+            consonant = float(parts[2])
+            cutoff = float(parts[3])
+            preutterance = float(parts[4])
+            overlap = float(parts[5])
+            entry = OtoEntry(
+                filename=filename,
+                alias=alias,
+                offset=offset,
+                consonant=consonant,
+                cutoff=cutoff,
+                preutterance=preutterance,
+                overlap=overlap
+            )
+            self.oto_entries[alias] = entry
+        except ValueError as e:
+            logger.warning(f"oto.ini 파라미터 파싱 실패: {line} - {e}")
+    def get_sample_for_phoneme(self, phoneme: str) -> Optional[OtoEntry]:
+        """음소에 해당하는 샘플 찾기"""
+        # 정확한 매치 먼저 시도
+        if phoneme in self.oto_entries:
+            return self.oto_entries[phoneme]
+        # 유사한 발음 찾기
+        candidates = []
+        for alias in self.oto_entries:
+            entry = self.oto_entries[alias]
+            if entry.clean_alias == phoneme:
+                candidates.append(entry)
+        if candidates:
+            # 숨소리가 아닌 것을 우선
+            non_breath = [c for c in candidates if not c.is_breath]
+            return non_breath[0] if non_breath else candidates[0]
+        return None
+    def get_wav_path(self, filename: str) -> Optional[Path]:
+        """WAV 파일 경로 가져오기"""
+        return self.wav_files.get(filename)
+    def list_available_phonemes(self) -> List[str]:
+        """사용 가능한 음소 목록"""
+        return list(set(entry.clean_alias for entry in self.oto_entries.values()))
+class UTAUEngine:
+    """UTAU 호환 음성 합성 엔진"""
+    def __init__(self, voicebank_path: Union[str, Path]):
+        self.voicebank = VoicebankManager(voicebank_path)
+        self.default_phoneme = "あ"  # 기본 음소
+    def synthesize_sequence(self,
+                          notes: List[Dict],
+                          lyrics: List[str],
+                          tempo: int = 120,
+                          volume: int = 100) -> Tuple[Optional[str], str]:
+        """Synthesize a note sequence with one lyric per note and write the mix to a WAV file.
+        Each note is rendered by ``_synthesize_note`` and added into one buffer at the
+        note's ``startSeconds`` position. Notes whose phoneme or WAV file cannot be found,
+        or whose rendering fails, are logged and skipped.
+        Args:
+            notes: Note dicts; this method reads ``startSeconds``, ``durationSeconds`` and
+                ``endSeconds`` (all in seconds). Other keys are consumed by ``_synthesize_note``.
+            lyrics: One lyric per note, in the same order as ``notes``.
+            tempo: Passed through to ``_synthesize_note``.
+            volume: Passed through to ``_synthesize_note``.
+        Returns:
+            ``(path, status)``. ``path`` is the name of a temporary ``.wav`` file that is not
+            deleted automatically, or ``None`` when the inputs are invalid, no note could be
+            synthesized, or an error occurred. ``status`` is a human-readable message.
+        """
+        if len(notes) != len(lyrics):
+            return None, "노트와 가사의 개수가 일치하지 않습니다."
+        if not notes:
+            return None, "노트가 없습니다."
+        try:
+            # Total length, computed in seconds; a note without 'endSeconds' uses start + duration.
+            max_end_time_seconds = max(note.get('endSeconds', note.get('startSeconds', 0) + note.get('durationSeconds', 0.5)) for note in notes)
+            max_end_time = max_end_time_seconds * 1000  # convert to milliseconds
+            sample_rate = 44100
+            # One extra second of headroom beyond the last note end.
+            total_samples = int(max_end_time * sample_rate / 1000) + sample_rate
+            final_audio = np.zeros(total_samples)
+            synthesized_count = 0
+            for i, (note, lyric) in enumerate(zip(notes, lyrics)):
+                try:
+                    # Convert the lyric to a phoneme.
+                    phoneme = self._lyric_to_phoneme(lyric)
+                    # Find the matching sample in the voicebank.
+                    oto_entry = self.voicebank.get_sample_for_phoneme(phoneme)
+                    if not oto_entry:
+                        logger.warning(f"음소 '{phoneme}' (가사: '{lyric}')에 해당하는 샘플을 찾을 수 없습니다.")
+                        continue
+                    # Resolve the WAV file path.
+                    wav_path = self.voicebank.get_wav_path(oto_entry.filename)
+                    if not wav_path or not wav_path.exists():
+                        logger.warning(f"WAV 파일을 찾을 수 없습니다: {oto_entry.filename}")
+                        continue
+                    # Render the note.
+                    synth_audio = self._synthesize_note(note, oto_entry, wav_path, tempo, volume)
+                    if synth_audio is not None:
+                        # Mix the rendered note into the output buffer.
+                        start_sample = int(note.get('startSeconds', 0) * sample_rate)  # seconds -> samples
+                        end_sample = start_sample + len(synth_audio)
+                        if end_sample <= len(final_audio):
+                            final_audio[start_sample:end_sample] += synth_audio
+                        else:
+                            # Grow the buffer: the rendered note runs past the current end.
+                            new_size = end_sample + sample_rate
+                            new_final_audio = np.zeros(new_size)
+                            new_final_audio[:len(final_audio)] = final_audio
+                            new_final_audio[start_sample:end_sample] += synth_audio
+                            final_audio = new_final_audio
+                        synthesized_count += 1
+                        logger.info(f"노트 {i+1} 합성 완료: {lyric} -> {phoneme}")
+                except Exception as e:
+                    # A failing note is logged and skipped; the remaining notes are still rendered.
+                    logger.error(f"노트 {i+1} 합성 실패: {e}")
+                    continue
+            if synthesized_count == 0:
+                return None, "합성된 노트가 없습니다."
+            # Peak-normalize the final mix to 0.8 (skipped when the buffer is all zeros).
+            if np.max(np.abs(final_audio)) > 0:
+                final_audio = final_audio / np.max(np.abs(final_audio)) * 0.8
+            # Save to a temporary file that outlives this call (delete=False).
+            output_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
+            sf.write(output_file.name, final_audio, sample_rate)
+            output_file.close()
+            duration_sec = len(final_audio) / sample_rate
+            return output_file.name, f"✅ UTAU 합성 완료: {synthesized_count}/{len(notes)}개 노트, {duration_sec:.1f}초"
+        except Exception as e:
+            error_msg = f"❌ UTAU 합성 중 오류: {str(e)}"
+            logger.error(error_msg)
+            return None, error_msg
+    def _lyric_to_phoneme(self, lyric: str) -> str:
+        """가사를 음소로 변환 (한국어 + 일본어 매핑)"""
+        # 공백 제거
+        lyric = lyric.strip()
+        # 빈 가사면 기본값 반환
+        if not lyric:
+            return self.default_phoneme
+        # 한국어 음소 매핑 (hanseol CVC용)
+        korean_map = {
+            # 기본 모음
+            '아': 'a', '이': 'i', '우': 'u', '에': 'e', '오': 'o', '으': 'eu', '어': 'eo',
+            # 기본 자음+모음
+            '바': 'ba', '비': 'bi', '부': 'bu', '베': 'be', '보': 'bo', '브': 'beu', '버': 'beo',
+            '다': 'da', '디': 'di', '두': 'du', '데': 'de', '도': 'do', '드': 'deu', '더': 'deo',
+            '가': 'ga', '기': 'gi', '구': 'gu', '게': 'ge', '고': 'go', '그': 'geu', '거': 'geo',
+            '하': 'ha', '히': 'hi', '후': 'hu', '헤': 'he', '호': 'ho', '흐': 'heu', '허': 'heo',
+            '자': 'ja', '지': 'ji', '주': 'ju', '제': 'je', '조': 'jo', '즈': 'jeu', '저': 'jeo',
+            '카': 'ka', '키': 'ki', '쿠': 'ku', '케': 'ke', '코': 'ko', '크': 'keu', '커': 'keo',
+            '라': 'la', '리': 'li', '루': 'lu', '레': 'le', '로': 'lo', '르': 'leu', '러': 'leo',
+            '마': 'ma', '미': 'mi', '무': 'mu', '메': 'me', '모': 'mo', '므': 'meu', '머': 'meo',
+            '나': 'na', '니': 'ni', '누': 'nu', '네': 'ne', '노': 'no', '느': 'neu', '너': 'neo',
+            '파': 'pa', '피': 'pi', '푸': 'pu', '페': 'pe', '포': 'po', '프': 'peu', '퍼': 'peo',
+            '라': 'ra', '리': 'ri', '루': 'ru', '레': 're', '로': 'ro', '르': 'reu', '러': 'reo',
+            '사': 'sa', '시': 'si', '수': 'su', '세': 'se', '소': 'so', '스': 'seu', '서': 'seo',
+            '타': 'ta', '티': 'ti', '투': 'tu', '테': 'te', '토': 'to', '트': 'teu', '터': 'teo',
+            # 복합모음
+            '야': 'ya', '예': 'ye', '여': 'yeo', '요': 'yo', '유': 'yu', '의': 'eui',
+            '와': 'wa', '웨': 'we', '위': 'wi', '워': 'weo',
+            # 기타 한국어 도레미
+            '도': 'do', '레': 're', '미': 'mi', '파': 'fa', '솔': 'so', '라': 'la', '시': 'si'
+        }
+        # 한국어 매핑 시도
+        if lyric in korean_map:
+            return korean_map[lyric]
+        # 로마자 -> 히라가나 변환 (일본어)
+        romaji_map = {
+            'a': 'あ', 'i': 'い', 'u': 'う', 'e': 'え', 'o': 'お',
+            'ka': 'か', 'ki': 'き', 'ku': 'く', 'ke': 'け', 'ko': 'こ',
+            'sa': 'さ', 'shi': 'し', 'su': 'す', 'se': 'せ', 'so': 'そ',
+            'ta': 'た', 'chi': 'ち', 'tsu': 'つ', 'te': 'て', 'to': 'と',
+            'na': 'な', 'ni': 'に', 'nu': 'ぬ', 'ne': 'ね', 'no': 'の',
+            'ha': 'は', 'hi': 'ひ', 'fu': 'ふ', 'he': 'へ', 'ho': 'ほ',
+            'ma': 'ま', 'mi': 'み', 'mu': 'む', 'me': 'め', 'mo': 'も',
+            'ya': 'や', 'yu': 'ゆ', 'yo': 'よ',
+            'ra': 'ら', 'ri': 'り', 'ru': 'る', 're': 'れ', 'ro': 'ろ',
+            'wa': 'わ', 'wo': 'を', 'n': 'ん'
+        }
+        # 로마자 변환 시도
+        lyric_lower = lyric.lower()
+        if lyric_lower in romaji_map:
+            return romaji_map[lyric_lower]
+        # 이미 음소인 경우 (hanseol CVC 직접 입력)
+        available_phonemes = self.voicebank.list_available_phonemes()
+        if lyric in available_phonemes:
+            return lyric
+        # 기본값 반환
+        logger.warning(f"알 수 없는 가사: '{lyric}', 기본 음소 '{self.default_phoneme}' 사용")
+        return self.default_phoneme
+    def _synthesize_note(self,
+                        note: Dict,
+                        oto_entry: OtoEntry,
+                        wav_path: Path,
+                        tempo: int,
+                        volume: int) -> Optional[np.ndarray]:
+        """개별 노트 합성"""
+        try:
+            # MIDI 노트를 노트 이름으로 변환
+            note_name = self._midi_to_note_name(note['pitch'])
+            # 노트 길이 검증 및 조정
+            min_duration = 200  # 최소 200ms
+            duration = max(note.get('durationSeconds', 0.5) * 1000, min_duration)  # 초를 밀리초로 변환
+            # 임시 출력 파일
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
+                temp_output = temp_file.name
+            try:
+                # UTAU 파라미터 검증 및 조정
+                offset = max(oto_entry.offset, 0)  # 음수 오프셋 방지
+                consonant = max(oto_entry.consonant if oto_entry.consonant > 0 else 50, 10)  # 최소 10ms
+                cutoff = max(oto_entry.cutoff if oto_entry.cutoff > 0 else 0, 0)
+                # WAV 파일 길이 확인
+                try:
+                    info = sf.info(wav_path)
+                    wav_duration_ms = (info.frames / info.samplerate) * 1000
+                    # 오프셋이 WAV 파일보다 긴 경우 조정
+                    if offset >= wav_duration_ms - 100:  # 100ms 여유
+                        offset = max(0, wav_duration_ms - 200)
+                        logger.warning(f"오프셋이 너무 큽니다. {offset}ms로 조정했습니다.")
+                    # 자음 길이가 너무 긴 경우 조정
+                    max_consonant = min(duration / 2, wav_duration_ms - offset - 50)
+                    consonant = min(consonant, max_consonant)
+                except Exception as e:
+                    logger.warning(f"WAV 파일 정보 확인 실패: {e}")
+                # 최소 길이 보장
+                if consonant < 10:
+                    consonant = 10
+                if duration < consonant + 50:
+                    duration = consonant + 50
+                logger.info(f"합성 파라미터: offset={offset:.1f}ms, consonant={consonant:.1f}ms, duration={duration:.1f}ms")
+                # straycat으로 합성
+                resampler = Resampler(
+                    in_file=str(wav_path),
+                    out_file=temp_output,
+                    pitch=note_name,
+                    velocity=note.get('velocity', 100),
+                    length=duration,
+                    volume=volume,
+                    flags='',
+                    offset=offset,
+                    consonant=consonant,
+                    cutoff=cutoff,
+                    modulation=0,
+                    tempo=f'!{tempo}'
+                )
+                # 합성된 오디오 로드
+                if os.path.exists(temp_output):
+                    synth_audio, _ = sf.read(temp_output)
+                    # 결과 검증
+                    if len(synth_audio) == 0:
+                        logger.warning("합성된 오디오가 비어있습니다.")
+                        return None
+                    return synth_audio
+                else:
+                    logger.warning("합성 결과 파일이 생성되지 않았습니다.")
+                    return None
+            except Exception as e:
+                logger.error(f"straycat 합성 실패: {e}")
+                return None
+            finally:
+                # 임시 파일 정리
+                if os.path.exists(temp_output):
+                    try:
+                        os.unlink(temp_output)
+                    except:
+                        pass
+        except Exception as e:
+            logger.error(f"노트 합성 실패: {e}")
+            return None
+        return None
+    def _midi_to_note_name(self, midi_note: int) -> str:
+        """MIDI 노트를 노트 이름으로 변환"""
+        notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
+        octave = (midi_note // 12) - 1
+        note = notes[midi_note % 12]
+        return f"{note}{octave}"
+    def get_available_phonemes(self) -> List[str]:
+        """사용 가능한 음소 목록 반환"""
+        return self.voicebank.list_available_phonemes()
+# 테스트 함수
+def test_utau_engine():
+    """UTAU 엔진 테스트"""
+    try:
+        voicebank_path = "voice/hanseol CVC"
+        engine = UTAUEngine(voicebank_path)
+        print(f"hanseol CVC 보이스뱅크 로드 완료!")
+        print(f"사용 가능한 음소: {len(engine.get_available_phonemes())}개")
+        print(f"첫 10개 음소: {engine.get_available_phonemes()[:10]}")
+        return engine
+    except Exception as e:
+        print(f"UTAU 엔진 테스트 실패: {e}")
+        return None
+# Script entry point: run the engine smoke test.
+if __name__ == "__main__":
+    test_utau_engine()

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

voice/hanseol_CVC_compressed.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e1c9292e72186f3701d906307e393bb8ca6b0342cde6e883caae513daf2eff61
+size 35548060

voice/test_voice.sc.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6dd4f9cdc03422f3febdf9787cb6f41592b51b11ff9ced53e7baf67e844b5858
+size 108682

voice/test_voice.wav ADDED Viewed

Binary file (88.2 kB). View file

voice_data_converter.py ADDED Viewed

	@@ -0,0 +1,264 @@

+import h5py
+import numpy as np
+import soundfile as sf
+from pathlib import Path
+import logging
+import json
+import gzip
+from typing import Dict, List, Optional, Tuple, Union
+import shutil
+from utau_engine import VoicebankManager, OtoEntry
+logger = logging.getLogger(__name__)
+class VoiceDataCompressor:
+    """음성 데이터를 HDF5 형태로 압축/관리하는 클래스"""
+    def __init__(self, output_path: str = "voice_data.h5"):
+        self.output_path = Path(output_path)
+        self.compression = 'gzip'  # 압축 알고리즘
+        self.compression_opts = 6  # 압축 레벨 (0-9)
+    def convert_voicebank_to_hdf5(self, voicebank_path: Union[str, Path]) -> bool:
+        """보이스뱅크를 HDF5 형태로 변환"""
+        try:
+            voicebank_path = Path(voicebank_path)
+            logger.info(f"보이스뱅크 변환 시작: {voicebank_path}")
+            # VoicebankManager로 기존 데이터 로드
+            vb_manager = VoicebankManager(voicebank_path)
+            # HDF5 파일 생성
+            with h5py.File(self.output_path, 'w') as h5file:
+                # 메타데이터 그룹
+                meta_group = h5file.create_group('metadata')
+                # oto.ini 정보 저장
+                oto_data = {}
+                for alias, entry in vb_manager.oto_entries.items():
+                    oto_data[alias] = {
+                        'filename': entry.filename,
+                        'alias': entry.alias,
+                        'offset': entry.offset,
+                        'consonant': entry.consonant,
+                        'cutoff': entry.cutoff,
+                        'preutterance': entry.preutterance,
+                        'overlap': entry.overlap
+                    }
+                # JSON으로 직렬화하여 압축 저장
+                oto_json = json.dumps(oto_data, ensure_ascii=False, indent=2)
+                oto_compressed = gzip.compress(oto_json.encode('utf-8'))
+                meta_group.create_dataset('oto_data', data=np.frombuffer(oto_compressed, dtype=np.uint8))
+                # 보이스뱅크 정보
+                meta_group.attrs['voicebank_name'] = voicebank_path.name
+                meta_group.attrs['total_entries'] = len(vb_manager.oto_entries)
+                meta_group.attrs['total_wav_files'] = len(vb_manager.wav_files)
+                # 오디오 데이터 그룹
+                audio_group = h5file.create_group('audio_data')
+                # 각 WAV 파일 처리
+                processed_files = set()
+                total_original_size = 0
+                for filename, wav_path in vb_manager.wav_files.items():
+                    if filename in processed_files:
+                        continue
+                    try:
+                        # 오디오 로드
+                        audio_data, sample_rate = sf.read(wav_path)
+                        # 파일 크기 계산
+                        total_original_size += wav_path.stat().st_size
+                        # 스테레오 → 모노 변환
+                        if len(audio_data.shape) > 1:
+                            audio_data = np.mean(audio_data, axis=1)
+                        # 파일별 그룹 생성
+                        file_group = audio_group.create_group(filename.replace('.wav', ''))
+                        # 오디오 데이터 저장 (압축 적용)
+                        file_group.create_dataset(
+                            'audio',
+                            data=audio_data.astype(np.float32),
+                            compression=self.compression,
+                            compression_opts=self.compression_opts,
+                            shuffle=True,  # 압축 효율 향상
+                            fletcher32=True  # 체크섬 추가
+                        )
+                        # 메타데이터 저장
+                        file_group.attrs['sample_rate'] = sample_rate
+                        file_group.attrs['duration'] = len(audio_data) / sample_rate
+                        file_group.attrs['original_filename'] = filename
+                        processed_files.add(filename)
+                        logger.info(f"변환 완료: {filename} ({len(audio_data)} samples)")
+                    except Exception as e:
+                        logger.error(f"파일 처리 실패 {wav_path}: {e}")
+                        continue
+                # 압축 통계
+                compressed_size = self.output_path.stat().st_size
+                compression_ratio = (1 - compressed_size / total_original_size) * 100
+                meta_group.attrs['original_size_bytes'] = total_original_size
+                meta_group.attrs['compressed_size_bytes'] = compressed_size
+                meta_group.attrs['compression_ratio_percent'] = compression_ratio
+                logger.info(f"변환 완료!")
+                logger.info(f"원본 크기: {total_original_size / (1024*1024):.1f} MB")
+                logger.info(f"���축 크기: {compressed_size / (1024*1024):.1f} MB")
+                logger.info(f"압축율: {compression_ratio:.1f}%")
+                return True
+        except Exception as e:
+            logger.error(f"HDF5 변환 실패: {e}")
+            return False
+class CompressedVoicebankManager:
+    """압축된 HDF5 보이스뱅크를 관리하는 클래스"""
+    def __init__(self, hdf5_path: Union[str, Path]):
+        self.hdf5_path = Path(hdf5_path)
+        self.oto_entries: Dict[str, OtoEntry] = {}
+        self._audio_cache: Dict[str, Tuple[np.ndarray, int]] = {}
+        self.cache_size_limit = 50  # 캐시할 최대 오디오 파일 수
+        if not self.hdf5_path.exists():
+            raise FileNotFoundError(f"압축된 보이스뱅크를 찾을 수 없습니다: {hdf5_path}")
+        self.load_metadata()
+    def load_metadata(self):
+        """HDF5에서 메타데이터 로드"""
+        try:
+            with h5py.File(self.hdf5_path, 'r') as h5file:
+                # oto.ini 데이터 로드
+                oto_compressed = h5file['metadata']['oto_data'][:]
+                oto_json = gzip.decompress(oto_compressed.tobytes()).decode('utf-8')
+                oto_data = json.loads(oto_json)
+                # OtoEntry 객체로 변환
+                for alias, data in oto_data.items():
+                    self.oto_entries[alias] = OtoEntry(
+                        filename=data['filename'],
+                        alias=data['alias'],
+                        offset=data['offset'],
+                        consonant=data['consonant'],
+                        cutoff=data['cutoff'],
+                        preutterance=data['preutterance'],
+                        overlap=data['overlap']
+                    )
+                # 메타데이터 로그
+                meta = h5file['metadata']
+                logger.info(f"압축된 보이스뱅크 로드: {meta.attrs['voicebank_name']}")
+                logger.info(f"총 {meta.attrs['total_entries']}개 엔트리")
+                logger.info(f"압축율: {meta.attrs['compression_ratio_percent']:.1f}%")
+        except Exception as e:
+            logger.error(f"메타데이터 로드 실패: {e}")
+            raise
+    def get_audio_data(self, filename: str) -> Optional[Tuple[np.ndarray, int]]:
+        """특정 파일의 오디오 데이터 로드 (캐싱 지원)"""
+        base_filename = filename.replace('.wav', '')
+        # 캐시 확인
+        if base_filename in self._audio_cache:
+            return self._audio_cache[base_filename]
+        try:
+            with h5py.File(self.hdf5_path, 'r') as h5file:
+                if base_filename not in h5file['audio_data']:
+                    return None
+                file_group = h5file['audio_data'][base_filename]
+                audio_data = file_group['audio'][:]
+                sample_rate = file_group.attrs['sample_rate']
+                # 캐시 관리 (LRU 방식)
+                if len(self._audio_cache) >= self.cache_size_limit:
+                    # 가장 오래된 항목 제거
+                    oldest_key = next(iter(self._audio_cache))
+                    del self._audio_cache[oldest_key]
+                # 캐시에 저장
+                result = (audio_data, int(sample_rate))
+                self._audio_cache[base_filename] = result
+                return result
+        except Exception as e:
+            logger.error(f"오디오 데이터 로드 실패 {filename}: {e}")
+            return None
+    def get_sample_for_phoneme(self, phoneme: str) -> Optional[OtoEntry]:
+        """음소에 해당하는 샘플 찾기 (기존 로직과 동일)"""
+        # 정확한 매치 먼저 시도
+        if phoneme in self.oto_entries:
+            return self.oto_entries[phoneme]
+        # 유사한 발음 찾기
+        candidates = []
+        for alias in self.oto_entries:
+            entry = self.oto_entries[alias]
+            if entry.clean_alias == phoneme:
+                candidates.append(entry)
+        if candidates:
+            # 숨소리가 아닌 것을 우선
+            non_breath = [c for c in candidates if not c.is_breath]
+            return non_breath[0] if non_breath else candidates[0]
+        return None
+    def list_available_phonemes(self) -> List[str]:
+        """사용 가능한 음소 목록"""
+        return list(set(entry.clean_alias for entry in self.oto_entries.values()))
+    def get_compression_info(self) -> Dict[str, object]:
+        """Read the compression statistics stored in the HDF5 ``metadata`` attributes.
+        Returns:
+            A dict with ``voicebank_name``, ``total_entries``, ``original_size_mb``,
+            ``compressed_size_mb``, ``compression_ratio`` (percent) and ``file_path``;
+            an empty dict when the file or its attributes cannot be read (the error is logged).
+        """
+        try:
+            with h5py.File(self.hdf5_path, 'r') as h5file:
+                meta = h5file['metadata']
+                return {
+                    'voicebank_name': meta.attrs['voicebank_name'],
+                    'total_entries': meta.attrs['total_entries'],
+                    # Sizes are stored in bytes and reported in MiB (1024 * 1024 bytes).
+                    'original_size_mb': meta.attrs['original_size_bytes'] / (1024*1024),
+                    'compressed_size_mb': meta.attrs['compressed_size_bytes'] / (1024*1024),
+                    'compression_ratio': meta.attrs['compression_ratio_percent'],
+                    'file_path': str(self.hdf5_path)
+                }
+        except Exception as e:
+            logger.error(f"압축 정보 로드 실패: {e}")
+            return {}
+def convert_voicebank_to_compressed_format(voicebank_path: str, output_path: str = None) -> bool:
+    """보이스뱅크를 압축 형태로 변환하는 편의 함수"""
+    if output_path is None:
+        voicebank_name = Path(voicebank_path).name.replace(' ', '_')
+        output_path = f"voice/{voicebank_name}_compressed.h5"
+    converter = VoiceDataCompressor(output_path)
+    return converter.convert_voicebank_to_hdf5(voicebank_path)
+# Script entry point: convert the bundled voicebank, then sanity-check the result.
+if __name__ == "__main__":
+    # Test conversion
+    success = convert_voicebank_to_compressed_format("voice/hanseol CVC")
+    if success:
+        print("✅ 보이스뱅크 압축 변환 완료!")
+        # Exercise the compressed version
+        compressed_vb = CompressedVoicebankManager("voice/hanseol_CVC_compressed.h5")
+        print(f"📊 압축 정보: {compressed_vb.get_compression_info()}")
+        print(f"🎤 사용 가능한 음소: {len(compressed_vb.list_available_phonemes())}개")
+    else:
+        print("❌ 보이스뱅크 압축 실패!")