crlotwhite committed on
Commit
1056960
·
1 Parent(s): 35c6482

Add UTAU WebUI project with LFS support for voice files

Browse files
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+ .DS_Store
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
COMPRESSION_REPORT.md ADDED
@@ -0,0 +1 @@
 
 
1
+
Makefile ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Every target in this file is a command, not a file; declare them all phony so
# a stray file named e.g. "all", "info" or "shell" can never shadow a target.
.PHONY: help setup install check-deps compress run dev test status clean clean-all update shell info all
2
+
3
+ # 기본 설정
4
+ PYTHON := uv run python
5
+ UV := uv
6
+ VOICEBANK_DIR := voice/hanseol CVC
7
+ COMPRESSED_FILE := voice/hanseol_CVC_compressed.h5
8
+ PORT := 7860
9
+
10
+ # 기본 타겟
11
+ help: ## 도움말 출력
12
+ @echo "🎵 UTAU WebUI - 개발 환경 자동화 도구"
13
+ @echo ""
14
+ @echo "📋 사용 가능한 명령어:"
15
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}'
16
+ @echo ""
17
+ @echo "🚀 빠른 시작:"
18
+ @echo " 1. make setup # 개발 환경 설정"
19
+ @echo " 2. make compress # 보이스뱅크 압축"
20
+ @echo " 3. make run # 웹UI 실행"
21
+ @echo ""
22
+
23
+ setup: ## 개발 환경 초기 설정
24
+ @echo "🔧 개발 환경을 설정합니다..."
25
+ @if ! command -v uv >/dev/null 2>&1; then \
26
+ echo "❌ uv가 설치되지 않았습니다. https://docs.astral.sh/uv/ 에서 설치하세요."; \
27
+ exit 1; \
28
+ fi
29
+ @echo "📦 의존성을 설치합니다..."
30
+ $(UV) sync
31
+ @echo "📁 필요한 디렉토리를 생성합니다..."
32
+ @mkdir -p voice
33
+ @echo "✅ 개발 환경 설정 완료!"
34
+
35
+ install: setup ## setup의 별칭
36
+
37
+ check-deps: ## 의존성 및 환경 확인
38
+ @echo "🔍 환경을 확인합니다..."
39
+ @echo "UV 버전: $$($(UV) --version 2>/dev/null || echo '❌ uv 없음')"
40
+ @echo "Python 버전: $$($(PYTHON) --version 2>/dev/null || echo '❌ Python 없음')"
41
+ @if [ -f "$(COMPRESSED_FILE)" ]; then \
42
+ echo "✅ 압축된 보이스뱅크: $(COMPRESSED_FILE)"; \
43
+ $(PYTHON) -c "import h5py; f=h5py.File('$(COMPRESSED_FILE)', 'r'); print(f'📊 메타데이터: {dict(f[\"metadata\"].attrs)}')"; \
44
+ else \
45
+ echo "❌ 압축된 보이스뱅크가 없음: $(COMPRESSED_FILE)"; \
46
+ fi
47
+ @if [ -d "$(VOICEBANK_DIR)" ]; then \
48
+ echo "✅ 원본 보이스뱅크: $(VOICEBANK_DIR) ($$(find "$(VOICEBANK_DIR)" -name "*.wav" | wc -l)개 WAV 파일)"; \
49
+ else \
50
+ echo "❌ 원본 보이스뱅크가 없음: $(VOICEBANK_DIR)"; \
51
+ fi
52
+
53
# Convert the source voicebank directory into a single HDF5 file.
# Both locations can be overridden on the command line, e.g.
#   make compress VOICEBANK_DIR=your/voicebank/path COMPRESSED_FILE=out.h5
# The Python snippet exits non-zero when the converter reports failure so the
# recipe stops instead of printing the success banner (and so `make all` does
# not go on to `run` after a failed conversion).
compress: ## 보이스뱅크를 HDF5 형태로 압축
	@echo "🗜️ 보이스뱅크를 압축합니다..."
	@if [ ! -d "$(VOICEBANK_DIR)" ]; then \
		echo "❌ 원본 보이스뱅크를 찾을 수 없습니다: $(VOICEBANK_DIR)"; \
		echo "📋 해결 방법:"; \
		echo " 1. hanseol CVC 보이스뱅크를 $(VOICEBANK_DIR) 에 배치"; \
		echo " 2. 또는 다른 보이스뱅크를 사용하려면:"; \
		echo " make compress VOICEBANK_DIR=your/voicebank/path"; \
		exit 1; \
	fi
	@echo "📁 원본 위치: $(VOICEBANK_DIR)"
	@echo "💾 압축 파일: $(COMPRESSED_FILE)"
	$(PYTHON) -c "\
	from voice_data_converter import convert_voicebank_to_compressed_format; \
	success = convert_voicebank_to_compressed_format('$(VOICEBANK_DIR)', '$(COMPRESSED_FILE)'); \
	print('✅ 압축 완료!' if success else '❌ 압축 실패'); \
	raise SystemExit(0 if success else 1)"
	@echo "🎉 보이스뱅크 압축이 완료되었습니다!"
70
+
71
+ run: ## 웹UI 실행
72
+ @echo "🚀 UTAU WebUI를 시작합니다..."
73
+ @if [ ! -f "$(COMPRESSED_FILE)" ]; then \
74
+ echo "❌ 압축된 보이스뱅크가 없습니다."; \
75
+ echo "📋 먼저 다음 명령어를 실행하세요: make compress"; \
76
+ exit 1; \
77
+ fi
78
+ @echo "🌐 웹 브라우저에서 http://localhost:$(PORT) 을 열어주세요"
79
+ $(PYTHON) webui.py
80
+
81
# Start the web UI with GRADIO_AUTO_RELOAD=1 in the environment.
# `uv run` has no `--env NAME=VALUE` option, so the variable is set through the
# shell environment of the command instead.
dev: ## 개발 모드로 실행 (auto-reload)
	@echo "🔧 개발 모드로 UTAU WebUI를 시작합니다..."
	@if [ ! -f "$(COMPRESSED_FILE)" ]; then \
		echo "❌ 압축된 보이스뱅크가 없습니다."; \
		echo "📋 먼저 다음 명령어를 실행하세요: make compress"; \
		exit 1; \
	fi
	@echo "🌐 웹 브라우저에서 http://localhost:$(PORT) 을 열어주세요"
	@echo "🔄 파일 변경 시 자동으로 재시작됩니다"
	GRADIO_AUTO_RELOAD=1 $(UV) run python webui.py
91
+
92
+ test: ## 압축된 보이스뱅크 테스트
93
+ @echo "🧪 압축된 보이스뱅크를 테스트합니다..."
94
+ @if [ ! -f "$(COMPRESSED_FILE)" ]; then \
95
+ echo "❌ 압축된 보이스뱅크가 없습니다: $(COMPRESSED_FILE)"; \
96
+ echo "📋 먼저 다음 명령어를 실행하세요: make compress"; \
97
+ exit 1; \
98
+ fi
99
+ $(PYTHON) test_compressed_voicebank.py
100
+
101
# Print the environment report (check-deps) followed by a one-line verdict on
# whether the compressed and/or source voicebank are present.
# The sub-make is invoked through $(MAKE) so flags such as -n and the jobserver
# propagate correctly; --no-print-directory keeps the report free of
# "Entering directory" noise.
status: ## 현재 상태 확인
	@echo "📊 UTAU WebUI 상태"
	@echo "===================="
	@$(MAKE) --no-print-directory check-deps
	@echo ""
	@if [ -f "$(COMPRESSED_FILE)" ] && [ -d "$(VOICEBANK_DIR)" ]; then \
		echo "🎉 모든 준비가 완료되었습니다! 'make run'으로 시작하세요."; \
	elif [ -f "$(COMPRESSED_FILE)" ]; then \
		echo "✅ 압축된 보이스뱅크가 준비되었습니다! 'make run'으로 시작하세요."; \
	elif [ -d "$(VOICEBANK_DIR)" ]; then \
		echo "⚠️ 보이스뱅크가 있지만 압축되지 않았습니다. 'make compress'를 실행하세요."; \
	else \
		echo "❌ 보이스뱅크가 없습니다. 먼저 보이스뱅크를 준비하고 'make compress'를 실행하세요."; \
	fi
115
+
116
+ clean: ## 임시 파일 및 캐시 정리
117
+ @echo "🧹 임시 파일을 정리합니다..."
118
+ @find . -type f -name "*.pyc" -delete 2>/dev/null || true
119
+ @find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
120
+ @find . -type f -name "*.tmp" -delete 2>/dev/null || true
121
+ @find . -type f -name ".DS_Store" -delete 2>/dev/null || true
122
+ @rm -rf .pytest_cache 2>/dev/null || true
123
+ @echo "✅ 정리 완료!"
124
+
125
# Run `clean`, then delete the compressed voicebank after an interactive
# confirmation. Only an explicit "y"/"Y" deletes the file.
# The prompt uses printf + plain `read` because `read -p` is a bash extension
# and recipes run under /bin/sh by default; an EOF on stdin (non-interactive
# run) is treated as "no".
clean-all: clean ## 모든 생성된 파일 삭제 (압축 파일 포함)
	@echo "🗑️ 모든 생성된 파일을 삭제합니다..."
	@if [ -f "$(COMPRESSED_FILE)" ]; then \
		echo "⚠️ 압축된 보이스뱅크도 삭제됩니다: $(COMPRESSED_FILE)"; \
		printf '%s' "계속하시겠습니까? (y/N): "; \
		read -r confirm || confirm=""; \
		if [ "$$confirm" = "y" ] || [ "$$confirm" = "Y" ]; then \
			rm -f "$(COMPRESSED_FILE)"; \
			echo "✅ 모든 파일이 삭제되었습니다."; \
		else \
			echo "❌ 취소되었습니다."; \
		fi \
	else \
		echo "✅ 정리할 파일이 없습니다."; \
	fi
139
+
140
+ # 개발자를 위한 추가 명령어
141
+ update: ## 의존성 업데이트
142
+ @echo "📦 의존성을 업데이트합니다..."
143
+ $(UV) sync --upgrade
144
+ @echo "✅ 업데이트 완료!"
145
+
146
# Open an interactive shell inside the project environment.
# uv has no `shell` subcommand, so the user's login shell ($SHELL from the
# environment, falling back to /bin/sh) is started through `uv run`.
shell: ## 프로젝트 쉘 진입
	@echo "🐚 프로젝트 쉘에 진입합니다..."
	$(UV) run "$${SHELL:-/bin/sh}"
149
+
150
+ info: ## 프로젝트 정보 출력
151
+ @echo "📋 UTAU WebUI 프로젝트 정보"
152
+ @echo "============================"
153
+ @echo "프로젝트: UTAU WebUI"
154
+ @echo "설명: 한국어 CVC 보이스뱅크를 사용한 웹 기반 UTAU 음성 합성기"
155
+ @echo "기술 스택: Python, Gradio, HDF5, UV"
156
+ @echo "포트: $(PORT)"
157
+ @echo "보이스뱅크: $(VOICEBANK_DIR)"
158
+ @echo "압축 파일: $(COMPRESSED_FILE)"
159
+ @echo ""
160
+ @echo "📁 디렉토리 구조:"
161
+ @find . -maxdepth 2 -type f -name "*.py" | head -10
162
+ @echo ""
163
+
164
+ # 전체 워크플로우
165
+ all: setup compress run ## 전체 설정 및 실행 (setup → compress → run)
166
+
167
+ # 기본 타겟을 help로 설정
168
+ .DEFAULT_GOAL := help
README.md CHANGED
@@ -9,5 +9,215 @@ app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
 
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  pinned: false
10
  license: mit
11
  ---
12
+ # 🎵 UTAU WebUI - 한국어 음성 합성기
13
 
14
+ 피아노롤 기반의 UTAU 음성 합성 시스템입니다. 한국어 CVC 보이스뱅크를 사용하여 자연스러운 한국어 음성을 합성할 수 있습니다.
15
+
16
+ ## ✨ 특징
17
+
18
+ - 🎹 **직관적인 피아노롤 인터페이스**: 웹 브라우저에서 바로 사용 가능한 피아노롤 편집기
19
+ - 🇰🇷 **한국어 음성 합성**: hanseol CVC 보이스뱅크를 사용한 고품질 한국어 음성 합성
20
+ - 🗜️ **HDF5 압축 시스템**: 47.7MB → 33.9MB (29% 압축), 46개 파일 → 1개 파일로 최적화
21
+ - 🚀 **자동화된 워크플로우**: Makefile을 통한 원클릭 설정, 압축, 실행
22
+ - 🎵 **실시간 편집**: 노트 추가, 삭제, 가사 입력이 실시간으로 가능
23
+ - 🔊 **웨이브폼 시각화**: 합성된 음성의 웨이브폼을 피아노롤에서 바로 확인
24
+ - 🎤 **CVC 음소 시스템**: 585개의 한국어 CVC 음소로 자연스러운 발음 구현
25
+ - ☁️ **클라우드 최적화**: Gradio 및 Hugging Face Spaces 환경에 최적화
26
+
27
+ ## 🎤 보이스뱅크 정보
28
+
29
+ - **보이스뱅크**: hanseol CVC (HDF5 압축)
30
+ - **CV (Character Voice)**: KUNGOM
31
+ - **UTAU**: KITANE 백한설
32
+ - **음소 수**: 585개 CVC 음소
33
+ - **언어**: 한국어
34
+ - **압축율**: 29% (원본 47.7MB → 압축 33.9MB)
35
+
36
+ ## 🚀 빠른 시작
37
+
38
+ ### 필요 조건
39
+
40
+ - Python 3.12+
41
+ - [uv](https://docs.astral.sh/uv/) (Python 패키지 관리자)
42
+ - GNU Make (자동화 스크립트용)
43
+
44
+ ### uv 설치
45
+
46
+ ```bash
47
+ # macOS/Linux
48
+ curl -LsSf https://astral.sh/uv/install.sh | sh
49
+
50
+ # Windows
51
+ powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
52
+ ```
53
+
54
+ ### 🎯 한 번에 설정하고 실행하기
55
+
56
+ ```bash
57
+ # 저장소 클론
58
+ git clone <repository-url>
59
+ cd utau-webui
60
+
61
+ # 모든 설정과 실행을 한 번에
62
+ make all
63
+ ```
64
+
65
+ ### 📋 단계별 실행
66
+
67
+ ```bash
68
+ # 1. 개발 환경 설정
69
+ make setup
70
+
71
+ # 2. 보이스뱅크 압축 (최초 1회만)
72
+ make compress
73
+
74
+ # 3. 웹UI 실행
75
+ make run
76
+ ```
77
+
78
+ ### 🔍 현재 상태 확인
79
+
80
+ ```bash
81
+ # 프로젝트 상태 확인
82
+ make status
83
+
84
+ # 의존성 및 환경 확인
85
+ make check-deps
86
+ ```
87
+
88
+ ## 📋 Makefile 명령어
89
+
90
+ | 명령어 | 설명 |
91
+ |--------|------|
92
+ | `make help` | 사용 가능한 모든 명령어 표시 |
93
+ | `make setup` | 개발 환경 초기 설정 (의존성 설치) |
94
+ | `make compress` | 보이스뱅크를 HDF5 형태로 압축 |
95
+ | `make run` | 웹UI 실행 |
96
+ | `make dev` | 개발 모드로 실행 (auto-reload) |
97
+ | `make test` | 압축된 보이스뱅크 테스트 |
98
+ | `make status` | 현재 프로젝트 상태 확인 |
99
+ | `make clean` | 임시 파일 및 캐시 정리 |
100
+ | `make all` | 전체 설정 및 실행 (setup → compress → run) |
101
+
102
+ ## 🎼 사용법
103
+
104
+ 1. **노트 추가**: 피아노롤에서 원하는 위치를 클릭하여 노트 추가
105
+ 2. **가사 입력**: 노트를 더블클릭하여 한국어 가사 입력
106
+ 3. **노트 편집**: 드래그하여 노트 길이 조정, 위아래로 드래그하여 음높이 조정
107
+ 4. **음성 합성**: "🎵 음성 합성" 버튼 클릭하여 음성 생성
108
+ 5. **재생**: 생성된 음성을 바로 들어보거나 다운로드
109
+
110
+ ### 지원하는 한국어 음소
111
+
112
+ - **기본 모음**: 아, 이, 우, 에, 오, 으, 어
113
+ - **자음+모음 조합**: 바, 다, 가, 하, 자, 카, 라, 마, 나, 파, 사, 타 등
114
+ - **복합 모음**: 야, 예, 여, 요, 유, 의, 와, 웨, 위, 워
115
+ - **도레미 음계**: 도, 레, 미, 파, 솔, 라, 시
116
+
117
+ ## 🗜️ HDF5 압축 시스템
118
+
119
+ ### 장점
120
+ - **파일 관리 최적화**: 46개 WAV 파일 → 1개 HDF5 파일
121
+ - **용량 최적화**: 29% 압축 효율 (47.7MB → 33.9MB)
122
+ - **성능 향상**: 더 빠른 로딩 및 배포
123
+ - **클라우드 친화적**: Hugging Face Spaces 등 클라우드 환경에 최적화
124
+
125
+ ### 압축 과정
126
+ ```bash
127
+ # 자동 압축
128
+ make compress
129
+
130
+ # 수동 압축
131
+ uv run python -c "from voice_data_converter import convert_voicebank_to_compressed_format; convert_voicebank_to_compressed_format('voice/hanseol CVC')"
132
+ ```
133
+
134
+ ## 🛠️ 기술 스택
135
+
136
+ - **Frontend**: Gradio + Custom PianoRoll Component
137
+ - **Backend**: Python
138
+ - **음성 합성**: UTAU Engine + Straycat Resampler
139
+ - **데이터 압축**: HDF5 (with gzip compression)
140
+ - **오디오 처리**: SoundFile, NumPy
141
+ - **패키지 관리**: uv
142
+ - **자동화**: GNU Make
143
+
144
+ ## 🔧 개발하기
145
+
146
+ ### 개발 환경 설정
147
+ ```bash
148
+ # 전체 개발 환경 설정
149
+ make setup
150
+
151
+ # 개발 모드로 실행 (파일 변경 시 자동 재시작)
152
+ make dev
153
+
154
+ # 쉘 진입
155
+ make shell
156
+ ```
157
+
158
+ ### 의존성 관리
159
+ ```bash
160
+ # 의존성 추가
161
+ uv add <package-name>
162
+
163
+ # 개발 의존성 추가
164
+ uv add --dev <package-name>
165
+
166
+ # 의존성 업데이트
167
+ make update
168
+ ```
169
+
170
+ ### 프로젝트 정보
171
+ ```bash
172
+ # 프로젝트 정보 확인
173
+ make info
174
+
175
+ # 의존성 트리 확인
176
+ uv tree
177
+ ```
178
+
179
+ ## 🚨 문제 해결
180
+
181
+ ### 압축된 보이스뱅크가 없는 경우
182
+ ```bash
183
+ # 상태 확인
184
+ make status
185
+
186
+ # 보이스뱅크 압축
187
+ make compress
188
+ ```
189
+
190
+ ### 원본 보이스뱅크가 없는 경우
191
+ 1. hanseol CVC 보이스뱅크를 `voice/hanseol CVC` 디렉토리에 배치
192
+ 2. `make compress` 실행
193
+
194
+ ### 환경 문제
195
+ ```bash
196
+ # 환경 확인
197
+ make check-deps
198
+
199
+ # 의존성 재설치
200
+ make setup
201
+ ```
202
+
203
+ ## 📝 저작권 및 라이선스
204
+
205
+ ### UTAU WebUI
206
+ 본 프로젝트는 오픈소스 소프트웨어입니다.
207
+
208
+ ### Straycat Resampler
209
+ 본 프로젝트에서 사용하는 UTAU 리샘플러는 [straycat](https://github.com/UtaUtaUtau/straycat)을 기반으로 합니다.
210
+
211
+ **원본 저장소**: https://github.com/UtaUtaUtau/straycat
212
+ **라이선스**: MIT License
213
+ **저작권**: Copyright (c) UtaUtaUtau
214
+
215
+ > Yet another WORLD-based UTAU resampler.
216
+
217
+ MIT 라이선스에 따라 본 프로젝트에 포함되었으며, 원본 저작권 표시를 유지합니다.
218
+
219
+ ### hanseol CVC 보이스뱅크
220
+ - **CV**: KUNGOM
221
+ - **UTAU**: KITANE 백한설
222
+
223
+ 해당 보이스뱅크의 사용 권한에 대해서는 원 저작권자의 이용 약관을 따릅니다.
app.py ADDED
@@ -0,0 +1,650 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import gradio_pianoroll as grp
3
+ import tempfile
4
+ import os
5
+ import numpy as np
6
+ import soundfile as sf
7
+ from pathlib import Path
8
+ from straycat import Resampler
9
+ import logging
10
+ import json
11
+ import base64
12
+ import io
13
+ import wave
14
+ from compressed_utau_engine import CompressedUTAUEngine
15
+ import os
16
+
17
+ # 로깅 설정
18
+ logging.basicConfig(level=logging.INFO)
19
+
20
+ # 압축된 보이스뱅크만 사용
21
+ utau_engine = None
22
+ USE_UTAU = False
23
+
24
+ # 압축된 hanseol CVC 보이스뱅크 경로
25
+ compressed_hanseol_path = "voice/hanseol_CVC_compressed.h5"
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ # 압축된 보이스뱅크 로드 (필수)
30
+ if os.path.exists(compressed_hanseol_path):
31
+ try:
32
+ utau_engine = CompressedUTAUEngine(compressed_hanseol_path)
33
+ USE_UTAU = True
34
+ available_phonemes = utau_engine.get_available_phonemes()
35
+ compression_info = utau_engine.get_compression_info()
36
+ logger.info(f"✅ 압축된 hanseol CVC 보이스뱅크 로드 완료: {len(available_phonemes)}개 음소")
37
+ logger.info(f"📊 압축율: {compression_info.get('compression_ratio', 0):.1f}%")
38
+ logger.info(f"💾 압축된 파일 크기: {compression_info.get('compressed_size_bytes', 0) / (1024*1024):.1f} MB")
39
+ except Exception as e:
40
+ logger.error(f"❌ 압축된 보이스뱅크 로드 실패: {e}")
41
+ print(f"\n{'='*60}")
42
+ print("🚨 압축된 보이스뱅크 로드 실패!")
43
+ print(f"파일 경로: {compressed_hanseol_path}")
44
+ print(f"오류: {e}")
45
+ print("\n📋 해결 방법:")
46
+ print("1. 다음 명령어로 보이스뱅크를 압축하세요:")
47
+ print(" make compress")
48
+ print("2. 또는 수동으로 실행:")
49
+ print(" uv run python voice_data_converter.py")
50
+ print(f"{'='*60}\n")
51
+ USE_UTAU = False
52
+ else:
53
+ logger.error(f"❌ 압축된 보이스뱅크 파일을 찾을 수 없음: {compressed_hanseol_path}")
54
+ print(f"\n{'='*60}")
55
+ print("🚨 압축된 보이스뱅크 파일이 없습니다!")
56
+ print(f"예상 위치: {compressed_hanseol_path}")
57
+ print("\n📋 해결 방법:")
58
+ print("1. 원본 보이스뱅크가 있다면 압축하세요:")
59
+ print(" make compress")
60
+ print("2. 또는 수동으로 실행:")
61
+ print(" uv run python -c \"from voice_data_converter import convert_voicebank_to_compressed_format; convert_voicebank_to_compressed_format('voice/hanseol CVC')\"")
62
+ print("\n3. 보이스뱅크 다운로드가 필요한 경우:")
63
+ print(" - hanseol CVC 보이스뱅크를 voice/ 디렉토리에 배치")
64
+ print(" - 그 후 위의 압축 명령어 실행")
65
+ print(f"{'='*60}\n")
66
+ USE_UTAU = False
67
+
68
+ # 압축된 보이스뱅크가 없으면 경고 메시지와 함께 제한된 기능만 제공
69
+ if not USE_UTAU:
70
+ available_phonemes = []
71
+ logger.warning("⚠️ 압축된 보이스뱅크 없이 제한된 모드로 실행됩니다.")
72
+ logger.warning("⚠️ 음성 합성 기능을 사용하려면 먼저 보이스뱅크를 압축하세요.")
73
+
74
def audio_to_base64_wav(audio_data, sample_rate):
    """Encode audio samples as a base64 ``data:audio/wav`` URI.

    The samples are peak-normalized to [-1, 1], converted to 16-bit PCM and
    written as a mono WAV file held in memory.

    Args:
        audio_data: Sequence or array of samples. A multi-dimensional array is
            treated as (frames, channels) and averaged down to mono, because
            the WAV header written here always declares a single channel.
        sample_rate: Frame rate stored in the WAV header.

    Returns:
        A ``data:audio/wav;base64,...`` string, or ``None`` when ``audio_data``
        is ``None`` or empty.
    """
    if audio_data is None or len(audio_data) == 0:
        return None

    samples = np.asarray(audio_data)

    # Down-mix multi-channel input; writing it unchanged into a 1-channel file
    # would interleave the channels and multiply the apparent frame count.
    if samples.ndim > 1:
        samples = samples.reshape(len(samples), -1).mean(axis=1)

    # Normalize audio data to [-1, 1] range (silence is left untouched)
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples = samples / peak

    # Convert to 16-bit PCM
    audio_16bit = (samples * 32767).astype(np.int16)

    # Create WAV file in memory
    buffer = io.BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setnchannels(1)  # Mono
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(audio_16bit.tobytes())

    # base64 encoding of the finished WAV bytes
    base64_data = base64.b64encode(buffer.getvalue()).decode('utf-8')

    return f"data:audio/wav;base64,{base64_data}"
100
+
101
def calculate_waveform_data(audio_data, pixels_per_beat, tempo, target_width=1000, sample_rate=44100):
    """Reduce audio samples to one min/max pair per piano-roll pixel.

    The horizontal scale is derived from the tempo: one beat spans
    ``pixels_per_beat`` pixels, so one second spans
    ``tempo / 60 * pixels_per_beat`` pixels.

    Args:
        audio_data: Sequence or array of samples.
        pixels_per_beat: Horizontal zoom of the piano roll.
        tempo: Tempo in beats per minute.
        target_width: Unused; kept so existing callers keep working.
        sample_rate: Sample rate of ``audio_data`` in Hz. Defaults to 44100,
            the value that was previously hard-coded.

    Returns:
        A list of ``{'x', 'min', 'max'}`` dicts (``x`` is the pixel index), or
        ``None`` when there is no audio or when the tempo, zoom or sample rate
        is not positive (no pixel scale can be derived).
    """
    if audio_data is None or len(audio_data) == 0:
        return None

    # A zero or negative scale would otherwise end in a division by zero below.
    if sample_rate <= 0 or tempo <= 0 or pixels_per_beat <= 0:
        return None

    total_samples = len(audio_data)

    # Calculate total audio duration (seconds)
    audio_duration = total_samples / sample_rate

    # Calculate total pixel length (based on tempo and pixels per beat)
    total_pixels = (tempo / 60) * pixels_per_beat * audio_duration

    # Calculate samples per pixel
    samples_per_pixel = total_samples / total_pixels

    waveform_points = []

    # Calculate min/max values for each pixel
    for pixel in range(int(total_pixels)):
        start_sample = int(pixel * samples_per_pixel)
        end_sample = min(int((pixel + 1) * samples_per_pixel), total_samples)

        if start_sample >= total_samples:
            break

        # Zoomed in past one sample per pixel: this pixel has no samples
        if start_sample >= end_sample:
            continue

        pixel_data = audio_data[start_sample:end_sample]
        waveform_points.append({
            'x': pixel,
            'min': float(np.min(pixel_data)),
            'max': float(np.max(pixel_data))
        })

    return waveform_points
146
+
147
+ def add_waveform_to_pianoroll(pianoroll_data, audio_data, sample_rate, tempo):
148
+ """Add waveform data to pianoroll for visualization - demo/app.py와 동일한 방식"""
149
+ # demo/app.py와 동일한 방식으로 완전히 복사
150
+ updated_pianoroll = pianoroll_data.copy() if pianoroll_data else {}
151
+
152
+ # Add backend audio data
153
+ audio_base64 = audio_to_base64_wav(audio_data, sample_rate)
154
+ updated_pianoroll['audio_data'] = audio_base64
155
+ updated_pianoroll['use_backend_audio'] = True
156
+
157
+ # Get tempo and pixels per beat from pianoroll data
158
+ pixels_per_beat = updated_pianoroll.get('pixelsPerBeat', 80)
159
+
160
+ # Calculate waveform data
161
+ waveform_data = calculate_waveform_data(audio_data, pixels_per_beat, tempo)
162
+
163
+ # demo/app.py와 동일한 curve_data 처리 방식
164
+ curve_data = {}
165
+
166
+ # Add waveform data to curve_data
167
+ if waveform_data:
168
+ curve_data['waveform_data'] = waveform_data
169
+ print(f"Waveform data created: {len(waveform_data)} points")
170
+
171
+ # Set curve data for piano roll (demo/app.py와 동일)
172
+ if curve_data:
173
+ updated_pianoroll['curve_data'] = curve_data
174
+
175
+ # demo/app.py와 같은 방식으로 segment_data 추가
176
+ if 'notes' in updated_pianoroll and updated_pianoroll['notes']:
177
+ segment_data = []
178
+
179
+ for i, note in enumerate(updated_pianoroll['notes']):
180
+ start_seconds = note.get('startSeconds', 0)
181
+ duration_seconds = note.get('durationSeconds', 0.5)
182
+
183
+ segment_data.append({
184
+ 'start': start_seconds,
185
+ 'end': start_seconds + duration_seconds,
186
+ 'type': 'note',
187
+ 'value': note.get('lyric', f"Note_{i+1}"),
188
+ 'confidence': 0.95
189
+ })
190
+
191
+ updated_pianoroll['segment_data'] = segment_data
192
+
193
+ # 상세한 디버깅 로그 (demo/app.py와 동일한 형식)
194
+ print(f"🔊 [add_waveform_to_pianoroll] Setting backend audio data:")
195
+ print(f" - audio_data length: {len(audio_base64) if audio_base64 else 0}")
196
+ print(f" - use_backend_audio: {updated_pianoroll['use_backend_audio']}")
197
+ print(f" - waveform points: {len(waveform_data) if waveform_data else 0}")
198
+ print(f" - Updated pianoroll keys: {list(updated_pianoroll.keys())}")
199
+
200
+ return updated_pianoroll
201
+
202
+ def create_test_voice_sample():
203
+ """테스트용 간단한 음성 샘플 생성"""
204
+ voice_dir = Path("voice")
205
+ voice_dir.mkdir(exist_ok=True)
206
+
207
+ sample_path = voice_dir / "test_voice.wav"
208
+
209
+ if not sample_path.exists():
210
+ # 간단한 사인파 기반 음성 샘플 생성 (A4 = 440Hz)
211
+ duration = 1.0 # 1초
212
+ sample_rate = 44100
213
+ t = np.linspace(0, duration, int(sample_rate * duration), False)
214
+
215
+ # 기본 주파수 (A4)
216
+ fundamental = 440.0
217
+
218
+ # 하모닉을 추가한 더 자연스러운 소리
219
+ signal = (np.sin(2 * np.pi * fundamental * t) * 0.5 +
220
+ np.sin(2 * np.pi * fundamental * 2 * t) * 0.2 +
221
+ np.sin(2 * np.pi * fundamental * 3 * t) * 0.1 +
222
+ np.sin(2 * np.pi * fundamental * 4 * t) * 0.05)
223
+
224
+ # ADSR 엔벨로프 적용
225
+ attack = int(0.05 * sample_rate)
226
+ decay = int(0.1 * sample_rate)
227
+ sustain_level = 0.7
228
+ release = int(0.2 * sample_rate)
229
+ sustain = len(signal) - attack - decay - release
230
+
231
+ envelope = np.ones_like(signal)
232
+ envelope[:attack] = np.linspace(0, 1, attack)
233
+ envelope[attack:attack+decay] = np.linspace(1, sustain_level, decay)
234
+ envelope[attack+decay:attack+decay+sustain] = sustain_level
235
+ envelope[-release:] = np.linspace(sustain_level, 0, release)
236
+
237
+ signal = signal * envelope
238
+
239
+ # 포먼트 필터 추가 (간단한 음성 특성)
240
+ from scipy import signal as scipy_signal
241
+
242
+ # 음성 특성을 모방한 간단한 필터
243
+ b, a = scipy_signal.butter(2, [300, 3000], btype='band', fs=sample_rate)
244
+ signal = scipy_signal.filtfilt(b, a, signal)
245
+
246
+ # 노이즈 추가로 더 자연스럽게
247
+ noise = np.random.normal(0, 0.02, len(signal))
248
+ signal = signal + noise
249
+
250
+ # 정규화
251
+ signal = signal / np.max(np.abs(signal)) * 0.8
252
+
253
+ sf.write(sample_path, signal, sample_rate)
254
+ logging.info(f"테스트 음성 샘플 생성: {sample_path}")
255
+
256
+ return sample_path
257
+
258
def midi_to_note_name(midi_note):
    """Convert a MIDI note number to a note name such as ``"C4"``.

    Octaves follow the convention where MIDI 60 is C4 (MIDI 0 is C-1).

    Args:
        midi_note: MIDI note number. Numeric values that are not ``int``
            (e.g. ``60.0`` decoded from JSON) are rounded to the nearest
            semitone; previously they raised ``TypeError`` on list indexing.

    Returns:
        The note name followed by the octave number, e.g. ``"A4"``.
    """
    notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
    semitone = int(round(float(midi_note)))
    octave = (semitone // 12) - 1
    return f"{notes[semitone % 12]}{octave}"
264
+
265
+ def synthesize_notes(pianoroll_data, use_utau_engine):
266
+ """피아노롤 데이터를 받아서 음성을 합성하고 웨이브폼을 피아노롤에 업데이트"""
267
+ if not pianoroll_data or not pianoroll_data.get('notes'):
268
+ return pianoroll_data, None, "노트가 없습니다. 피아노롤에 노트를 추가하세요."
269
+
270
+ try:
271
+ notes = pianoroll_data['notes']
272
+
273
+ # 기본값 설정
274
+ velocity_setting = 100
275
+ volume_setting = 100
276
+ use_vibrato = False
277
+ vibrato_depth = 20
278
+
279
+ # 피아노롤에서 tempo 가져오기 (기본값: 120)
280
+ tempo = pianoroll_data.get('tempo', 120)
281
+ logging.info(f"합성할 노트 수: {len(notes)}, 템포: {tempo} BPM (피아노롤에서 가져옴)")
282
+
283
+ # 피아노롤 노트에서 가사 추출
284
+ lyrics = []
285
+ for note in notes:
286
+ lyric = note.get('lyric', '').strip()
287
+ if not lyric:
288
+ lyric = "あ" # 가사가 없으면 기본 일본어 음소
289
+ lyrics.append(lyric)
290
+
291
+ logging.info(f"추출된 가사: {lyrics}")
292
+
293
+ # UTAU 엔진 사용 여부 결정
294
+ use_utau = use_utau_engine.startswith("UTAU 엔진")
295
+ if use_utau and USE_UTAU and utau_engine:
296
+ # UTAU 엔진으로 합성
297
+ audio_file, status = utau_engine.synthesize_sequence(
298
+ notes=notes,
299
+ lyrics=lyrics,
300
+ tempo=tempo, # 피아노롤의 tempo 사용
301
+ volume=volume_setting
302
+ )
303
+
304
+ if audio_file:
305
+ # 합성된 오디오 로드
306
+ audio_data, sample_rate = sf.read(audio_file)
307
+
308
+ # 피아노롤에 웨이브폼 데이터 추가
309
+ updated_pianoroll = add_waveform_to_pianoroll(
310
+ pianoroll_data, audio_data, sample_rate, tempo # 피아노롤의 tempo 사용
311
+ )
312
+
313
+ return updated_pianoroll, audio_file, status
314
+ else:
315
+ return pianoroll_data, None, status
316
+
317
+ # 기본 엔진으로 합성 (기존 코드)
318
+ # 테스트 음성 샘플 준비
319
+ voice_sample = create_test_voice_sample()
320
+
321
+ # 전체 길이 계산 (가장 늦게 끝나는 노트 기준) - 초 단위로 계산
322
+ max_end_time_seconds = max(note.get('endSeconds', note.get('startSeconds', 0) + note.get('durationSeconds', 0.5)) for note in notes)
323
+ max_end_time = max_end_time_seconds * 1000 # 밀리초로 변환
324
+ sample_rate = 44100
325
+ total_samples = int(max_end_time * sample_rate / 1000) + sample_rate # 여유분 추가
326
+
327
+ # 최종 오디오 버퍼
328
+ final_audio = np.zeros(total_samples)
329
+
330
+ # 각 노트를 개별적으로 합성하고 믹싱
331
+ for i, note in enumerate(notes):
332
+ try:
333
+ pitch = note['pitch']
334
+ start_ms = note.get('startSeconds', 0) * 1000 # 초를 밀리초로 변환
335
+ duration_ms = note.get('durationSeconds', 0.5) * 1000 # 초를 밀리초로 변환
336
+ velocity = note.get('velocity', velocity_setting)
337
+
338
+ note_name = midi_to_note_name(pitch)
339
+ start_seconds = note.get('startSeconds', 0)
340
+ logging.info(f"노트 {i+1}: {note_name} (MIDI {pitch}), 시작: {start_seconds}s ({start_ms}ms), 길이: {duration_ms}ms")
341
+
342
+ # 임시 출력 파일
343
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
344
+ temp_output = temp_file.name
345
+
346
+ # 플래그 설정
347
+ flags = ''
348
+ if use_vibrato:
349
+ flags += f'A{int(vibrato_depth)}'
350
+
351
+ try:
352
+ # straycat Resampler로 합성 (length를 노트 길이에 맞게 설정)
353
+ resampler = Resampler(
354
+ in_file=str(voice_sample),
355
+ out_file=temp_output,
356
+ pitch=note_name,
357
+ velocity=velocity,
358
+ length=max(duration_ms, 200), # 최소 200ms 보장
359
+ volume=volume_setting,
360
+ flags=flags,
361
+ offset=0,
362
+ consonant=20, # 약간의 자연스러운 어택
363
+ cutoff=0,
364
+ modulation=10, # 약간의 모듈레이션
365
+ tempo=f'!{int(tempo)}' # 피아노롤의 tempo 사용
366
+ )
367
+
368
+ # 합성된 오디오 로드
369
+ if os.path.exists(temp_output):
370
+ synth_audio, _ = sf.read(temp_output)
371
+
372
+ # 오디오를 올바른 위치에 배치
373
+ start_sample = int(start_ms * sample_rate / 1000)
374
+ end_sample = start_sample + len(synth_audio)
375
+
376
+ if end_sample <= len(final_audio):
377
+ final_audio[start_sample:end_sample] += synth_audio * (velocity / 100)
378
+ else:
379
+ # 버퍼가 부족하면 확장
380
+ new_size = end_sample + sample_rate
381
+ new_final_audio = np.zeros(new_size)
382
+ new_final_audio[:len(final_audio)] = final_audio
383
+ new_final_audio[start_sample:end_sample] += synth_audio * (velocity / 100)
384
+ final_audio = new_final_audio
385
+
386
+ logging.info(f"노트 {i+1} 합성 완료")
387
+
388
+ except Exception as e:
389
+ logging.error(f"노트 {i+1} 합성 실패: {e}")
390
+ continue
391
+
392
+ finally:
393
+ # 임시 파일 정리
394
+ if os.path.exists(temp_output):
395
+ os.unlink(temp_output)
396
+
397
+ except Exception as e:
398
+ logging.error(f"노트 {i+1} 처리 실패: {e}")
399
+ continue
400
+
401
+ # 최종 오디오 정규화 및 마스터링
402
+ if np.max(np.abs(final_audio)) > 0:
403
+ # 컴프레서 효과 (간단한 버전)
404
+ threshold = 0.7
405
+ ratio = 4.0
406
+
407
+ # 피크 검출
408
+ abs_audio = np.abs(final_audio)
409
+ over_threshold = abs_audio > threshold
410
+
411
+ # 컴프레션 적용
412
+ compressed = final_audio.copy()
413
+ compressed[over_threshold] = (
414
+ np.sign(final_audio[over_threshold]) *
415
+ (threshold + (abs_audio[over_threshold] - threshold) / ratio)
416
+ )
417
+
418
+ final_audio = compressed / np.max(np.abs(compressed)) * 0.85
419
+
420
+ # 최종 파일 저장
421
+ output_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
422
+ sf.write(output_file.name, final_audio, sample_rate)
423
+ output_file.close()
424
+
425
+ # 피아노롤에 웨이브폼 데이터 추가
426
+ updated_pianoroll = add_waveform_to_pianoroll(
427
+ pianoroll_data, final_audio, sample_rate, tempo # 피아노롤의 tempo 사용
428
+ )
429
+
430
+ duration_sec = len(final_audio) / sample_rate
431
+ logging.info(f"합성 완료: {len(notes)}개 노트, 총 길이: {duration_sec:.2f}초, 템포: {tempo} BPM")
432
+ return updated_pianoroll, output_file.name, f"✅ 합성 완료: {len(notes)}개 노트, {duration_sec:.1f}초, 템포: {tempo} BPM"
433
+
434
+ except Exception as e:
435
+ error_msg = f"❌ 합성 중 오류 발생: {str(e)}"
436
+ logging.error(error_msg)
437
+ return pianoroll_data, None, error_msg
438
+
439
+ def create_example_melody():
440
+ """예제 멜로디 생성 - demo/app.py와 동일한 방식"""
441
+ # demo/app.py와 동일한 노트 구조 사용 (id 추가)
442
+ notes = [
443
+ {
444
+ "id": "note_0",
445
+ "start": 0,
446
+ "duration": 160,
447
+ "pitch": 60, # C4
448
+ "velocity": 100,
449
+ "lyric": "도",
450
+ "startSeconds": 0.0,
451
+ "durationSeconds": 0.5,
452
+ "endSeconds": 0.5
453
+ },
454
+ {
455
+ "id": "note_1",
456
+ "start": 160,
457
+ "duration": 160,
458
+ "pitch": 62, # D4
459
+ "velocity": 100,
460
+ "lyric": "레",
461
+ "startSeconds": 0.5,
462
+ "durationSeconds": 0.5,
463
+ "endSeconds": 1.0
464
+ },
465
+ {
466
+ "id": "note_2",
467
+ "start": 320,
468
+ "duration": 160,
469
+ "pitch": 64, # E4
470
+ "velocity": 100,
471
+ "lyric": "미",
472
+ "startSeconds": 1.0,
473
+ "durationSeconds": 0.5,
474
+ "endSeconds": 1.5
475
+ },
476
+ {
477
+ "id": "note_3",
478
+ "start": 480,
479
+ "duration": 160,
480
+ "pitch": 67, # G4
481
+ "velocity": 100,
482
+ "lyric": "솔",
483
+ "startSeconds": 1.5,
484
+ "durationSeconds": 0.5,
485
+ "endSeconds": 2.0
486
+ }
487
+ ]
488
+
489
+ # demo/app.py와 동일한 완전한 초기값 구조
490
+ initial_value = {
491
+ "notes": notes,
492
+ "tempo": 120,
493
+ "timeSignature": {"numerator": 4, "denominator": 4},
494
+ "editMode": "select",
495
+ "snapSetting": "1/4",
496
+ "pixelsPerBeat": 80, # demo/app.py와 동일
497
+ "curve_data": {},
498
+ "use_backend_audio": True # demo/app.py와 동일하게 True
499
+ }
500
+
501
+ print("🎼 예제 멜로디 생성됨")
502
+ return initial_value
503
+
504
+ # Gradio 인터페이스
505
+ with gr.Blocks(title="UTAU WebUI", theme=gr.themes.Soft()) as demo:
506
+ gr.Markdown("# 🎵 UTAU WebUI - Gradio로 구현된 UTAU 음성 합성기")
507
+ gr.Markdown("피아노롤에서 노트를 그리고 '음성 합성' 버튼을 클릭하여 음성을 생성하세요.")
508
+ gr.Markdown("## 🎤 주의 사항")
509
+ gr.Markdown("""
510
+ - 유닛 선택 알고리즘이 CVC 음원에 최적화 되지 않아서 발음 오류가 발생할 수 있습니다.
511
+ - 일부 UI 오류가 있으며 현재 해결 중 입니다.
512
+ - 템포가 적용되지 않는 문제가 있습니다. 120bpm을 유지하면서 사용해주세요.""")
513
+
514
+ with gr.Row():
515
+ with gr.Column(scale=3):
516
+ # demo/app.py와 동일한 초기값 구조
517
+ initial_pianoroll_value = {
518
+ "notes": [],
519
+ "tempo": 120,
520
+ "timeSignature": {"numerator": 4, "denominator": 4},
521
+ "editMode": "select",
522
+ "snapSetting": "1/4",
523
+ "pixelsPerBeat": 80,
524
+ "curve_data": {},
525
+ "use_backend_audio": True # demo/app.py와 동일하게 True
526
+ }
527
+
528
+ pianoroll = grp.PianoRoll(
529
+ width=1000,
530
+ height=800,
531
+ label="피아노롤 편집기",
532
+ value=initial_pianoroll_value,
533
+ elem_id="piano_roll_utau", # 고유 ID 추가
534
+ use_backend_audio=True # demo/app.py와 동일하게 True로 시작
535
+ )
536
+
537
+ with gr.Row():
538
+ clear_btn = gr.Button("🗑️ 초기화", size="sm")
539
+ example_btn = gr.Button("🎼 예제 멜로디", size="sm", variant="secondary")
540
+ info_text = gr.Markdown("**사용법:** 클릭하여 노트 추가, 드래그하여 길이 조정, 더블클릭하여 가사 입력")
541
+
542
+ with gr.Column(scale=1):
543
+ # 엔진 선택 및 가사 입력
544
+ gr.Markdown("### 🎤 음성 엔진")
545
+ with gr.Group():
546
+ # UTAU 엔진 선택지를 동적으로 생성
547
+ engine_choices = []
548
+ if USE_UTAU and utau_engine:
549
+ engine_choices.append(f"UTAU 엔진 ({'hanseol CVC'})")
550
+ engine_choices.append("기본 엔진")
551
+
552
+ engine_radio = gr.Radio(
553
+ choices=engine_choices,
554
+ value=f"UTAU 엔진 ({'hanseol CVC'})" if USE_UTAU and utau_engine else "기본 엔진",
555
+ label="합성 엔진",
556
+ info="UTAU 엔진은 실제 보이스뱅크 사용"
557
+ )
558
+
559
+ # 가사는 피아노롤 노트에서 직접 입력
560
+ gr.Markdown("**가사 입력**: 피아노롤에서 노트를 더블클릭하여 가사를 입력하세요.")
561
+
562
+ synthesis_btn = gr.Button("🎵 음성 합성", variant="primary", size="lg")
563
+ status_text = gr.Textbox(
564
+ label="합성 상태",
565
+ value="노트를 추가하고 합성 버튼을 클릭하세요.",
566
+ interactive=False,
567
+ lines=2
568
+ )
569
+
570
+ audio_output = gr.Audio(
571
+ label="합성된 음성",
572
+ visible=True
573
+ )
574
+
575
+ gr.Markdown("### 📊 보이스뱅크 정보")
576
+ if USE_UTAU:
577
+ compression_info = utau_engine.get_compression_info()
578
+ gr.Markdown(f"""
579
+ - **보이스뱅크:** hanseol CVC (압축된 HDF5 🗜️)
580
+ - **CV:** KUNGOM
581
+ - **UTAU:** KITANE 백한설
582
+ - **사용 가능한 음소:** {len(utau_engine.get_available_phonemes())}개
583
+ - **압축율:** {compression_info.get('compression_ratio', 0):.1f}%
584
+ - **용량:** {compression_info.get('compressed_size_bytes', 0) / (1024*1024):.1f} MB""")
585
+ else:
586
+ gr.Markdown("""
587
+ - **보이스뱅크:** ❌ 압축된 보이스뱅크 없음
588
+ - **상태:** 제한된 모드로 실행 중
589
+ - **해결책:** `make compress` 명령어로 보이스뱅크를 먼저 압축하세요.""")
590
+
591
+
592
+ # 이벤트 핸들러
593
+ synthesis_btn.click(
594
+ fn=synthesize_notes,
595
+ inputs=[
596
+ pianoroll,
597
+ engine_radio
598
+ ],
599
+ outputs=[pianoroll, audio_output, status_text]
600
+ )
601
+
602
+ def clear_pianoroll():
603
+ """피아노롤 초기화 - demo/app.py와 동일한 방식"""
604
+ initial_data = {
605
+ "notes": [],
606
+ "tempo": 120,
607
+ "timeSignature": {"numerator": 4, "denominator": 4},
608
+ "editMode": "select",
609
+ "snapSetting": "1/4",
610
+ "pixelsPerBeat": 80, # demo/app.py와 동일
611
+ "curve_data": {},
612
+ "use_backend_audio": True # demo/app.py와 동일하게 True 유지
613
+ }
614
+ print("🗑️ 피아노롤 초기화됨")
615
+ return initial_data
616
+
617
+ clear_btn.click(
618
+ fn=clear_pianoroll,
619
+ outputs=[pianoroll]
620
+ )
621
+
622
+ example_btn.click(
623
+ fn=create_example_melody,
624
+ outputs=[pianoroll]
625
+ )
626
+
627
+ # 초기 설정
628
+ create_test_voice_sample()
629
+
630
+ # playhead 동작을 위한 이벤트 핸들러 추가
631
+ def log_play_event(event_data=None):
632
+ print("🔊 Play event triggered:", event_data)
633
+ return f"재생 시작: {event_data if event_data else '재생 중'}"
634
+
635
+ def log_pause_event(event_data=None):
636
+ print("🔊 Pause event triggered:", event_data)
637
+ return f"일시정지: {event_data if event_data else '일시정지됨'}"
638
+
639
+ def log_stop_event(event_data=None):
640
+ print("🔊 Stop event triggered:", event_data)
641
+ return f"정지: {event_data if event_data else '정지됨'}"
642
+
643
+ # playhead 이벤트 핸들러 연결
644
+ pianoroll.play(log_play_event, outputs=status_text)
645
+ pianoroll.pause(log_pause_event, outputs=status_text)
646
+ pianoroll.stop(log_stop_event, outputs=status_text)
647
+
648
+
649
+ if __name__ == "__main__":
650
+ demo.launch()
compressed_utau_engine.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from pathlib import Path
5
+ from typing import Dict, List, Optional, Tuple, Union
6
+ import logging
7
+ from straycat import Resampler
8
+ from voice_data_converter import CompressedVoicebankManager, OtoEntry
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
class CompressedUTAUEngine:
    """UTAU-compatible singing synthesis engine backed by a compressed HDF5 voicebank.

    Samples and oto entries are looked up through ``CompressedVoicebankManager``;
    each note is rendered with the straycat ``Resampler`` and mixed into a single
    mono buffer that is written to a temporary WAV file.
    """

    # Output sample rate (Hz) of the mixed sequence.
    _OUTPUT_SAMPLE_RATE = 44100

    # Simple Hangul-syllable -> voicebank alias mapping. Built once at class
    # level instead of on every call; the solfege syllables that are not already
    # covered by the regular rows ('레', '솔') are listed at the end.
    _HANGUL_TO_PHONEME = {
        '가': 'ka', '나': 'na', '다': 'da', '라': 'ra', '마': 'ma',
        '바': 'ba', '사': 'sa', '아': 'a', '자': 'za', '차': 'cha',
        '카': 'ka', '타': 'ta', '파': 'pa', '하': 'ha',
        '거': 'ke', '너': 'ne', '더': 'de', '러': 're', '머': 'me',
        '버': 'be', '서': 'se', '어': 'e', '저': 'ze', '처': 'che',
        '커': 'ke', '터': 'te', '퍼': 'pe', '허': 'he',
        '고': 'ko', '노': 'no', '도': 'do', '로': 'ro', '모': 'mo',
        '보': 'bo', '소': 'so', '오': 'o', '조': 'zo', '초': 'cho',
        '코': 'ko', '토': 'to', '포': 'po', '호': 'ho',
        '구': 'ku', '누': 'nu', '두': 'du', '루': 'ru', '무': 'mu',
        '부': 'bu', '수': 'su', '우': 'u', '주': 'zu', '추': 'chu',
        '쿠': 'ku', '투': 'tu', '푸': 'pu', '후': 'hu',
        '기': 'ki', '니': 'ni', '디': 'di', '리': 'ri', '미': 'mi',
        '비': 'bi', '시': 'si', '이': 'i', '지': 'zi', '치': 'chi',
        '키': 'ki', '티': 'ti', '피': 'pi', '히': 'hi',
        '레': 're', '솔': 'so',
    }

    def __init__(self, compressed_voicebank_path: Union[str, Path]):
        """Open the compressed voicebank at ``compressed_voicebank_path``."""
        self.voicebank = CompressedVoicebankManager(compressed_voicebank_path)
        self.default_phoneme = "あ"  # fallback phoneme for empty/unknown lyrics
        logger.info("압축된 UTAU 엔진 초기화 완료")

    def synthesize_sequence(self,
                            notes: List[Dict],
                            lyrics: List[str],
                            tempo: int = 120,
                            volume: int = 100) -> Tuple[Optional[str], str]:
        """Synthesize a note sequence with per-note lyrics into one WAV file.

        Args:
            notes: Note dicts. Reads ``pitch`` (required), and optionally
                ``startSeconds``, ``durationSeconds``, ``endSeconds``, ``velocity``.
            lyrics: One lyric per note; must have the same length as ``notes``.
            tempo: Tempo forwarded to the resampler.
            volume: Volume forwarded to the resampler.

        Returns:
            ``(wav_path, message)`` on success, ``(None, message)`` on failure.
            The WAV file is a temporary file that is intentionally NOT deleted
            here, because its path is handed back to the caller.
        """
        if len(notes) != len(lyrics):
            return None, "노트와 가사의 개수가 일치하지 않습니다."

        if not notes:
            return None, "합성할 노트가 없습니다."

        try:
            sample_rate = self._OUTPUT_SAMPLE_RATE

            # Total length: latest note end plus one second of tail room.
            max_end_time = max(
                note.get('endSeconds',
                         note.get('startSeconds', 0) + note.get('durationSeconds', 0.5))
                for note in notes
            )
            total_samples = int(max_end_time * sample_rate) + sample_rate
            final_audio = np.zeros(total_samples)

            for i, (note, lyric) in enumerate(zip(notes, lyrics)):
                # A failure in one note must not abort the whole sequence.
                try:
                    phoneme = self._lyric_to_phoneme(lyric)

                    oto_entry = self.voicebank.get_sample_for_phoneme(phoneme)
                    if not oto_entry:
                        logger.warning(f"음소 '{phoneme}'에 해당하는 샘플을 찾을 수 없음")
                        continue

                    audio_result = self.voicebank.get_audio_data(oto_entry.filename)
                    if not audio_result:
                        logger.warning(f"오디오 파일 로드 실패: {oto_entry.filename}")
                        continue

                    source_audio, source_sample_rate = audio_result

                    synth_audio = self._synthesize_note(
                        note, oto_entry, source_audio, source_sample_rate, tempo, volume
                    )

                    if synth_audio is not None:
                        # NOTE(review): the resampler output is assumed to already be
                        # at the output sample rate; its actual rate is not checked.
                        start_sample = int(note.get('startSeconds', 0) * sample_rate)
                        gain = note.get('velocity', 100) / 100
                        final_audio = self._mix_note(
                            final_audio, synth_audio, start_sample, gain, sample_rate
                        )
                        logger.info(f"노트 {i+1} 합성 완료: {phoneme}")

                except Exception as e:
                    logger.error(f"노트 {i+1} 합성 실패: {e}")
                    continue

            # Peak-normalize to 0.85 full scale (skip for an all-silent buffer).
            peak = np.max(np.abs(final_audio))
            if peak > 0:
                final_audio = final_audio / peak * 0.85

            # Reserve a temp path, close the handle, then write by name so the
            # write also works on platforms that lock open files.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as output_file:
                output_path = output_file.name
            sf.write(output_path, final_audio, sample_rate)

            duration_sec = len(final_audio) / sample_rate
            return output_path, f"✅ 압축된 보이스뱅크로 합성 완료: {len(notes)}개 노트, {duration_sec:.1f}초"

        except Exception as e:
            logger.error(f"시퀀스 합성 실패: {e}")
            return None, f"❌ 합성 실패: {str(e)}"

    @staticmethod
    def _mix_note(final_audio: np.ndarray,
                  synth_audio: np.ndarray,
                  start_sample: int,
                  gain: float,
                  pad_samples: int) -> np.ndarray:
        """Add ``synth_audio * gain`` into ``final_audio`` starting at ``start_sample``.

        The buffer is mixed in place when the clip fits; otherwise a larger buffer
        (clip end + ``pad_samples``) is allocated, the old content copied, and the
        new buffer returned. Multi-channel clips are averaged down to mono and a
        negative start is clamped to 0 so the slice assignment cannot mis-shape.
        """
        clip = np.asarray(synth_audio)
        if clip.ndim > 1:
            clip = clip.mean(axis=1)

        start_sample = max(int(start_sample), 0)
        end_sample = start_sample + len(clip)

        if end_sample > len(final_audio):
            grown = np.zeros(end_sample + pad_samples)
            grown[:len(final_audio)] = final_audio
            final_audio = grown

        final_audio[start_sample:end_sample] += clip * gain
        return final_audio

    def _lyric_to_phoneme(self, lyric: str) -> str:
        """Map a lyric to a phoneme alias.

        Order: empty lyric -> default phoneme; known Hangul syllable -> mapped
        alias; lyric that is itself a key of the voicebank's oto entries -> used
        as-is; anything else -> default phoneme.
        """
        lyric = lyric.strip()
        if not lyric:
            return self.default_phoneme

        mapped = self._HANGUL_TO_PHONEME.get(lyric)
        if mapped is not None:
            return mapped

        return lyric if lyric in self.voicebank.oto_entries else self.default_phoneme

    def _synthesize_note(self,
                         note: Dict,
                         oto_entry: "OtoEntry",
                         source_audio: np.ndarray,
                         source_sample_rate: int,
                         tempo: int,
                         volume: int) -> Optional[np.ndarray]:
        """Render a single note through the straycat resampler.

        The decoded source sample is written to a temporary WAV, resampled into a
        second temporary WAV, and the result is read back. Both temporary files
        are removed on every exit path (success, missing output, or exception).

        Returns:
            The rendered audio array, or ``None`` if rendering failed.
        """
        temp_paths: List[Path] = []
        try:
            # Reserve both paths with closed handles before anything writes to them.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_input:
                input_path = Path(temp_input.name)
            temp_paths.append(input_path)

            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_output:
                output_path = Path(temp_output.name)
            temp_paths.append(output_path)

            sf.write(str(input_path), source_audio, source_sample_rate)

            pitch = note['pitch']
            duration_ms = note.get('durationSeconds', 0.5) * 1000
            velocity = note.get('velocity', 100)
            note_name = self._midi_to_note_name(pitch)

            # NOTE(review): constructing Resampler is presumed to perform the
            # render and write out_file — confirm against straycat.
            Resampler(
                in_file=str(input_path),
                out_file=str(output_path),
                pitch=note_name,
                velocity=velocity,
                length=max(duration_ms, 200),  # at least 200 ms
                volume=volume,
                offset=oto_entry.offset,
                consonant=oto_entry.consonant,
                cutoff=oto_entry.cutoff,
                modulation=10,
                tempo=f'!{tempo}'
            )

            # The placeholder file always exists, so also treat "still empty"
            # as "the resampler produced nothing".
            if not output_path.exists() or output_path.stat().st_size == 0:
                logger.error(f"합성된 파일이 생성되지 않음: {output_path}")
                return None

            synth_audio, _ = sf.read(str(output_path))
            return synth_audio

        except Exception as e:
            logger.error(f"노트 합성 실패: {e}")
            return None

        finally:
            for temp_path in temp_paths:
                try:
                    temp_path.unlink(missing_ok=True)
                except OSError:
                    # Best-effort cleanup; never mask the real result/error.
                    pass

    def _midi_to_note_name(self, midi_note: int) -> str:
        """Convert a MIDI note number to a note name, e.g. 60 -> ``"C4"``."""
        notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        octave = (midi_note // 12) - 1
        note = notes[midi_note % 12]
        return f"{note}{octave}"

    def get_available_phonemes(self) -> List[str]:
        """Return the phonemes available in the voicebank."""
        return self.voicebank.list_available_phonemes()

    def get_compression_info(self) -> Dict[str, object]:
        """Return the voicebank's compression metadata."""
        return self.voicebank.get_compression_info()
pyproject.toml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "utau-webui"
3
+ version = "0.1.0"
4
+ description = "한국어 CVC 보이스뱅크를 사용한 웹 기반 UTAU 음성 합성기"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ authors = [
8
+ { name = "UTAU WebUI Team" }
9
+ ]
10
+ keywords = ["utau", "voice-synthesis", "korean", "music", "audio"]
11
+ classifiers = [
12
+ "Development Status :: 3 - Alpha",
13
+ "Intended Audience :: End Users/Desktop",
14
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.12",
18
+ ]
19
+ dependencies = [
20
+ "gradio>=5.33.1",
21
+ "gradio-pianoroll>=0.0.8",
22
+ "h5py>=3.10.0",
23
+ "librosa>=0.11.0",
24
+ "llvmlite>=0.44.0",
25
+ "numba>=0.61.2",
26
+ "numpy>=2.2.0",
27
+ "pyworld>=0.3.5",
28
+ "resampy>=0.4.3",
29
+ "scipy>=1.15.3",
30
+ "setuptools>=80.9.0",
31
+ "soundfile>=0.12.1",
32
+ ]
requirements.txt ADDED
@@ -0,0 +1,809 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv export -o requirements.txt
3
+ aiofiles==24.1.0 \
4
+ --hash=sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c \
5
+ --hash=sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5
6
+ # via gradio
7
+ annotated-types==0.7.0 \
8
+ --hash=sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53 \
9
+ --hash=sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89
10
+ # via pydantic
11
+ anyio==4.9.0 \
12
+ --hash=sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028 \
13
+ --hash=sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c
14
+ # via
15
+ # gradio
16
+ # httpx
17
+ # starlette
18
+ audioop-lts==0.2.1 ; python_full_version >= '3.13' \
19
+ --hash=sha256:05da64e73837f88ee5c6217d732d2584cf638003ac72df124740460531e95e47 \
20
+ --hash=sha256:120678b208cca1158f0a12d667af592e067f7a50df9adc4dc8f6ad8d065a93fb \
21
+ --hash=sha256:161249db9343b3c9780ca92c0be0d1ccbfecdbccac6844f3d0d44b9c4a00a17f \
22
+ --hash=sha256:2aeb6f96f7f6da80354330470b9134d81b4cf544cdd1c549f2f45fe964d28059 \
23
+ --hash=sha256:2bdb3b7912ccd57ea53197943f1bbc67262dcf29802c4a6df79ec1c715d45a78 \
24
+ --hash=sha256:3827e3fce6fee4d69d96a3d00cd2ab07f3c0d844cb1e44e26f719b34a5b15455 \
25
+ --hash=sha256:4a8dd6a81770f6ecf019c4b6d659e000dc26571b273953cef7cd1d5ce2ff3ae6 \
26
+ --hash=sha256:534ce808e6bab6adb65548723c8cbe189a3379245db89b9d555c4210b4aaa9b6 \
27
+ --hash=sha256:54cd4520fc830b23c7d223693ed3e1b4d464997dd3abc7c15dce9a1f9bd76ab2 \
28
+ --hash=sha256:56b7a0a4dba8e353436f31a932f3045d108a67b5943b30f85a5563f4d8488d77 \
29
+ --hash=sha256:5b7b4ff9de7a44e0ad2618afdc2ac920b91f4a6d3509520ee65339d4acde5abf \
30
+ --hash=sha256:64562c5c771fb0a8b6262829b9b4f37a7b886c01b4d3ecdbae1d629717db08b4 \
31
+ --hash=sha256:6e899eb8874dc2413b11926b5fb3857ec0ab55222840e38016a6ba2ea9b7d5e3 \
32
+ --hash=sha256:72e37f416adb43b0ced93419de0122b42753ee74e87070777b53c5d2241e7fab \
33
+ --hash=sha256:78bfb3703388c780edf900be66e07de5a3d4105ca8e8720c5c4d67927e0b15d0 \
34
+ --hash=sha256:a351af79edefc2a1bd2234bfd8b339935f389209943043913a919df4b0f13300 \
35
+ --hash=sha256:c45317debeb64002e980077642afbd977773a25fa3dfd7ed0c84dccfc1fafcb0 \
36
+ --hash=sha256:c589f06407e8340e81962575fcffbba1e92671879a221186c3d4662de9fe804e \
37
+ --hash=sha256:d1cd3c0b6f2ca25c7d2b1c3adeecbe23e65689839ba73331ebc7d893fcda7ffe \
38
+ --hash=sha256:d2d5434717f33117f29b5691fbdf142d36573d751716249a288fbb96ba26a281 \
39
+ --hash=sha256:d2de9b6fb8b1cf9f03990b299a9112bfdf8b86b6987003ca9e8a6c4f56d39543 \
40
+ --hash=sha256:d6bd20c7a10abcb0fb3d8aaa7508c0bf3d40dfad7515c572014da4b979d3310a \
41
+ --hash=sha256:e175350da05d2087e12cea8e72a70a1a8b14a17e92ed2022952a4419689ede5e \
42
+ --hash=sha256:e1af3ff32b8c38a7d900382646e91f2fc515fd19dea37e9392275a5cbfdbff63 \
43
+ --hash=sha256:e81268da0baa880431b68b1308ab7257eb33f356e57a5f9b1f915dfb13dd1387 \
44
+ --hash=sha256:f0ed1ad9bd862539ea875fb339ecb18fcc4148f8d9908f4502df28f94d23491a \
45
+ --hash=sha256:f0f2f336aa2aee2bce0b0dcc32bbba9178995454c7b979cf6ce086a8801e14c7 \
46
+ --hash=sha256:f24865991b5ed4b038add5edbf424639d1358144f4e2a3e7a84bc6ba23e35074 \
47
+ --hash=sha256:f51bb55122a89f7a0817d7ac2319744b4640b5b446c4c3efcea5764ea99ae509 \
48
+ --hash=sha256:f626a01c0a186b08f7ff61431c01c055961ee28769591efa8800beadd27a2959 \
49
+ --hash=sha256:fbae5d6925d7c26e712f0beda5ed69ebb40e14212c185d129b8dfbfcc335eb48 \
50
+ --hash=sha256:fd1345ae99e17e6910f47ce7d52673c6a1a70820d78b67de1b7abb3af29c426a \
51
+ --hash=sha256:ff3f97b3372c97782e9c6d3d7fdbe83bce8f70de719605bd7ee1839cd1ab360a
52
+ # via
53
+ # gradio
54
+ # standard-aifc
55
+ # standard-sunau
56
+ audioread==3.0.1 \
57
+ --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \
58
+ --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d
59
+ # via librosa
60
+ certifi==2025.4.26 \
61
+ --hash=sha256:0a816057ea3cdefcef70270d2c515e4506bbc954f417fa5ade2021213bb8f0c6 \
62
+ --hash=sha256:30350364dfe371162649852c63336a15c70c6510c2ad5015b21c2345311805f3
63
+ # via
64
+ # httpcore
65
+ # httpx
66
+ # requests
67
+ cffi==1.17.1 \
68
+ --hash=sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2 \
69
+ --hash=sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36 \
70
+ --hash=sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824 \
71
+ --hash=sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3 \
72
+ --hash=sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed \
73
+ --hash=sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8 \
74
+ --hash=sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903 \
75
+ --hash=sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683 \
76
+ --hash=sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9 \
77
+ --hash=sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c \
78
+ --hash=sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4 \
79
+ --hash=sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65 \
80
+ --hash=sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93 \
81
+ --hash=sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4 \
82
+ --hash=sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3 \
83
+ --hash=sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff \
84
+ --hash=sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5 \
85
+ --hash=sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd \
86
+ --hash=sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5 \
87
+ --hash=sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d \
88
+ --hash=sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e \
89
+ --hash=sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a \
90
+ --hash=sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99
91
+ # via soundfile
92
+ charset-normalizer==3.4.2 \
93
+ --hash=sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7 \
94
+ --hash=sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0 \
95
+ --hash=sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b \
96
+ --hash=sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff \
97
+ --hash=sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e \
98
+ --hash=sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148 \
99
+ --hash=sha256:4e594135de17ab3866138f496755f302b72157d115086d100c3f19370839dd3a \
100
+ --hash=sha256:5a9979887252a82fefd3d3ed2a8e3b937a7a809f65dcb1e068b090e165bbe99e \
101
+ --hash=sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63 \
102
+ --hash=sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c \
103
+ --hash=sha256:7222ffd5e4de8e57e03ce2cef95a4c43c98fcb72ad86909abdfc2c17d227fc1b \
104
+ --hash=sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0 \
105
+ --hash=sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0 \
106
+ --hash=sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1 \
107
+ --hash=sha256:a370b3e078e418187da8c3674eddb9d983ec09445c99a3a263c2011993522981 \
108
+ --hash=sha256:a955b438e62efdf7e0b7b52a64dc5c3396e2634baa62471768a64bc2adb73d5c \
109
+ --hash=sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980 \
110
+ --hash=sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7 \
111
+ --hash=sha256:bee093bf902e1d8fc0ac143c88902c3dfc8941f7ea1d6a8dd2bcb786d33db03d \
112
+ --hash=sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3 \
113
+ --hash=sha256:cf713fe9a71ef6fd5adf7a79670135081cd4431c2943864757f0fa3a65b1fafd \
114
+ --hash=sha256:d41c4d287cfc69060fa91cae9683eacffad989f1a10811995fa309df656ec214 \
115
+ --hash=sha256:db4c7bf0e07fc3b7d89ac2a5880a6a8062056801b83ff56d8464b70f65482b6c \
116
+ --hash=sha256:dedb8adb91d11846ee08bec4c8236c8549ac721c245678282dcb06b221aab59f \
117
+ --hash=sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691 \
118
+ --hash=sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf \
119
+ --hash=sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b \
120
+ --hash=sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a
121
+ # via requests
122
+ click==8.2.1 ; sys_platform != 'emscripten' \
123
+ --hash=sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202 \
124
+ --hash=sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b
125
+ # via
126
+ # typer
127
+ # uvicorn
128
+ colorama==0.4.6 ; sys_platform == 'win32' \
129
+ --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \
130
+ --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6
131
+ # via
132
+ # click
133
+ # tqdm
134
+ decorator==5.2.1 \
135
+ --hash=sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360 \
136
+ --hash=sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a
137
+ # via librosa
138
+ fastapi==0.115.12 \
139
+ --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \
140
+ --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d
141
+ # via gradio
142
+ ffmpy==0.6.0 \
143
+ --hash=sha256:332dd93198a162db61e527e866a04578d3713e577bfe68f2ed26ba9d09dbc948 \
144
+ --hash=sha256:c8369bf45f8bd5285ebad94c4a789a79e7af86eded74c1f8c36eccf57aaea58c
145
+ # via gradio
146
+ filelock==3.18.0 \
147
+ --hash=sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2 \
148
+ --hash=sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de
149
+ # via huggingface-hub
150
+ fsspec==2025.5.1 \
151
+ --hash=sha256:24d3a2e663d5fc735ab256263c4075f374a174c3410c0b25e5bd1970bceaa462 \
152
+ --hash=sha256:2e55e47a540b91843b755e83ded97c6e897fa0942b11490113f09e9c443c2475
153
+ # via
154
+ # gradio-client
155
+ # huggingface-hub
156
+ gradio==5.33.1 \
157
+ --hash=sha256:c4329b04280d62041fbf0113e94fb5c4d20e0555ce1ac69174bf98225350159b \
158
+ --hash=sha256:f74c737aa92fc02b4d7dca7e50ee13ddce548aa16c9fcbe907ceabf93722f94d
159
+ # via
160
+ # gradio-pianoroll
161
+ # utau-webui
162
+ gradio-client==1.10.3 \
163
+ --hash=sha256:941e7f8d9a160f88487e9780a3db2736a40ea2b8b69d53ffdb306e47ef658b76 \
164
+ --hash=sha256:9e99b88e47f05dc3b68e40a3f3f83819f8d0ddcd43466ad385fe42e137825774
165
+ # via gradio
166
+ gradio-pianoroll==0.0.8 \
167
+ --hash=sha256:26abd2c98ccb8bb30e8269324ca8675109a502e266c4e9c8bfff524d1a9c0218 \
168
+ --hash=sha256:f7ac6d27dab2873c35bba5041b94afc9159922be2a9cebe202e8a87f4ec79e86
169
+ # via utau-webui
170
+ groovy==0.1.2 \
171
+ --hash=sha256:25c1dc09b3f9d7e292458aa762c6beb96ea037071bf5e917fc81fb78d2231083 \
172
+ --hash=sha256:7f7975bab18c729a257a8b1ae9dcd70b7cafb1720481beae47719af57c35fa64
173
+ # via gradio
174
+ h11==0.16.0 \
175
+ --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \
176
+ --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86
177
+ # via
178
+ # httpcore
179
+ # uvicorn
180
+ h5py==3.14.0 \
181
+ --hash=sha256:0cbd41f4e3761f150aa5b662df991868ca533872c95467216f2bec5fcad84882 \
182
+ --hash=sha256:2372116b2e0d5d3e5e705b7f663f7c8d96fa79a4052d250484ef91d24d6a08f4 \
183
+ --hash=sha256:554ef0ced3571366d4d383427c00c966c360e178b5fb5ee5bb31a435c424db0c \
184
+ --hash=sha256:6da62509b7e1d71a7d110478aa25d245dd32c8d9a1daee9d2a42dba8717b047a \
185
+ --hash=sha256:aa4b7bbce683379b7bf80aaba68e17e23396100336a8d500206520052be2f812 \
186
+ --hash=sha256:ae18e3de237a7a830adb76aaa68ad438d85fe6e19e0d99944a3ce46b772c69b3 \
187
+ --hash=sha256:bf4897d67e613ecf5bdfbdab39a1158a64df105827da70ea1d90243d796d367f \
188
+ --hash=sha256:d90e6445ab7c146d7f7981b11895d70bc1dd91278a4f9f9028bc0c95e4a53f13 \
189
+ --hash=sha256:e0045115d83272090b0717c555a31398c2c089b87d212ceba800d3dc5d952e23 \
190
+ --hash=sha256:e8cbaf6910fa3983c46172666b0b8da7b7bd90d764399ca983236f2400436eeb \
191
+ --hash=sha256:ef9603a501a04fcd0ba28dd8f0995303d26a77a980a1f9474b3417543d4c6174
192
+ # via utau-webui
193
+ hf-xet==1.1.3 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \
194
+ --hash=sha256:30c575a5306f8e6fda37edb866762140a435037365eba7a17ce7bd0bc0216a8b \
195
+ --hash=sha256:7c1a6aa6abed1f696f8099aa9796ca04c9ee778a58728a115607de9cc4638ff1 \
196
+ --hash=sha256:8203f52827e3df65981984936654a5b390566336956f65765a8aa58c362bb841 \
197
+ --hash=sha256:a5f09b1dd24e6ff6bcedb4b0ddab2d81824098bb002cf8b4ffa780545fa348c3 \
198
+ --hash=sha256:b578ae5ac9c056296bb0df9d018e597c8dc6390c5266f35b5c44696003cde9f3 \
199
+ --hash=sha256:b788a61977fbe6b5186e66239e2a329a3f0b7e7ff50dad38984c0c74f44aeca1 \
200
+ --hash=sha256:c3b508b5f583a75641aebf732853deb058953370ce8184f5dabc49f803b0819b \
201
+ --hash=sha256:fd2da210856444a34aad8ada2fc12f70dabed7cc20f37e90754d1d9b43bc0534
202
+ # via huggingface-hub
203
+ httpcore==1.0.9 \
204
+ --hash=sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55 \
205
+ --hash=sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8
206
+ # via httpx
207
+ httpx==0.28.1 \
208
+ --hash=sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc \
209
+ --hash=sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad
210
+ # via
211
+ # gradio
212
+ # gradio-client
213
+ # safehttpx
214
+ huggingface-hub==0.32.6 \
215
+ --hash=sha256:32cde9558c965477556edca72352621def7fbc42e167aaf33f4cdb9af65bb28b \
216
+ --hash=sha256:8e960f23dc57519c6c2a0bbc7e9bc030eaa14e7f2d61f8e68fd3d025dabed2fa
217
+ # via
218
+ # gradio
219
+ # gradio-client
220
+ idna==3.10 \
221
+ --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
222
+ --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
223
+ # via
224
+ # anyio
225
+ # httpx
226
+ # requests
227
+ jinja2==3.1.6 \
228
+ --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \
229
+ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67
230
+ # via gradio
231
+ joblib==1.5.1 \
232
+ --hash=sha256:4719a31f054c7d766948dcd83e9613686b27114f190f717cec7eaa2084f8a74a \
233
+ --hash=sha256:f4f86e351f39fe3d0d32a9f2c3d8af1ee4cec285aafcb27003dda5205576b444
234
+ # via
235
+ # librosa
236
+ # scikit-learn
237
+ lazy-loader==0.4 \
238
+ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \
239
+ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1
240
+ # via librosa
241
+ librosa==0.11.0 \
242
+ --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \
243
+ --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908
244
+ # via utau-webui
245
+ llvmlite==0.44.0 \
246
+ --hash=sha256:07667d66a5d150abed9157ab6c0b9393c9356f229784a4385c02f99e94fc94d4 \
247
+ --hash=sha256:1d671a56acf725bf1b531d5ef76b86660a5ab8ef19bb6a46064a705c6ca80aad \
248
+ --hash=sha256:2fb7c4f2fb86cbae6dca3db9ab203eeea0e22d73b99bc2341cdf9de93612e930 \
249
+ --hash=sha256:319bddd44e5f71ae2689859b7203080716448a3cd1128fb144fe5c055219d516 \
250
+ --hash=sha256:46224058b13c96af1365290bdfebe9a6264ae62fb79b2b55693deed11657a8bf \
251
+ --hash=sha256:5f79a728e0435493611c9f405168682bb75ffd1fbe6fc360733b850c80a026db \
252
+ --hash=sha256:9c58867118bad04a0bb22a2e0068c693719658105e40009ffe95c7000fcde88e \
253
+ --hash=sha256:aa0097052c32bf721a4efc03bd109d335dfa57d9bffb3d4c24cc680711b8b4fc \
254
+ --hash=sha256:c0143a5ef336da14deaa8ec26c5449ad5b6a2b564df82fcef4be040b9cacfea9 \
255
+ --hash=sha256:d752f89e31b66db6f8da06df8b39f9b91e78c5feea1bf9e8c1fba1d1c24c065d \
256
+ --hash=sha256:eae7e2d4ca8f88f89d315b48c6b741dcb925d6a1042da694aa16ab3dd4cbd3a1
257
+ # via
258
+ # numba
259
+ # utau-webui
260
+ markdown-it-py==3.0.0 ; sys_platform != 'emscripten' \
261
+ --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \
262
+ --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb
263
+ # via rich
264
+ markupsafe==3.0.2 \
265
+ --hash=sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30 \
266
+ --hash=sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9 \
267
+ --hash=sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396 \
268
+ --hash=sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028 \
269
+ --hash=sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557 \
270
+ --hash=sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a \
271
+ --hash=sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c \
272
+ --hash=sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c \
273
+ --hash=sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22 \
274
+ --hash=sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094 \
275
+ --hash=sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5 \
276
+ --hash=sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225 \
277
+ --hash=sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c \
278
+ --hash=sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87 \
279
+ --hash=sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf \
280
+ --hash=sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb \
281
+ --hash=sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48 \
282
+ --hash=sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c \
283
+ --hash=sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6 \
284
+ --hash=sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd \
285
+ --hash=sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1 \
286
+ --hash=sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d \
287
+ --hash=sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca \
288
+ --hash=sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a \
289
+ --hash=sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe \
290
+ --hash=sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8 \
291
+ --hash=sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f \
292
+ --hash=sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f \
293
+ --hash=sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0 \
294
+ --hash=sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79 \
295
+ --hash=sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430
296
+ # via
297
+ # gradio
298
+ # jinja2
299
+ mdurl==0.1.2 ; sys_platform != 'emscripten' \
300
+ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \
301
+ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba
302
+ # via markdown-it-py
303
+ msgpack==1.1.0 \
304
+ --hash=sha256:06f5fd2f6bb2a7914922d935d3b8bb4a7fff3a9a91cfce6d06c13bc42bec975b \
305
+ --hash=sha256:071603e2f0771c45ad9bc65719291c568d4edf120b44eb36324dcb02a13bfddf \
306
+ --hash=sha256:0907e1a7119b337971a689153665764adc34e89175f9a34793307d9def08e6ca \
307
+ --hash=sha256:0f92a83b84e7c0749e3f12821949d79485971f087604178026085f60ce109330 \
308
+ --hash=sha256:115a7af8ee9e8cddc10f87636767857e7e3717b7a2e97379dc2054712693e90f \
309
+ --hash=sha256:17fb65dd0bec285907f68b15734a993ad3fc94332b5bb21b0435846228de1f39 \
310
+ --hash=sha256:2137773500afa5494a61b1208619e3871f75f27b03bcfca7b3a7023284140247 \
311
+ --hash=sha256:398b713459fea610861c8a7b62a6fec1882759f308ae0795b5413ff6a160cf3c \
312
+ --hash=sha256:4676e5be1b472909b2ee6356ff425ebedf5142427842aa06b4dfd5117d1ca8a2 \
313
+ --hash=sha256:4a1964df7b81285d00a84da4e70cb1383f2e665e0f1f2a7027e683956d04b734 \
314
+ --hash=sha256:4b51405e36e075193bc051315dbf29168d6141ae2500ba8cd80a522964e31434 \
315
+ --hash=sha256:58dfc47f8b102da61e8949708b3eafc3504509a5728f8b4ddef84bd9e16ad420 \
316
+ --hash=sha256:59caf6a4ed0d164055ccff8fe31eddc0ebc07cf7326a2aaa0dbf7a4001cd823e \
317
+ --hash=sha256:5dbad74103df937e1325cc4bfeaf57713be0b4f15e1c2da43ccdd836393e2ea2 \
318
+ --hash=sha256:65553c9b6da8166e819a6aa90ad15288599b340f91d18f60b2061f402b9a4915 \
319
+ --hash=sha256:7a946a8992941fea80ed4beae6bff74ffd7ee129a90b4dd5cf9c476a30e9708d \
320
+ --hash=sha256:7c9a35ce2c2573bada929e0b7b3576de647b0defbd25f5139dcdaba0ae35a4cc \
321
+ --hash=sha256:a51abd48c6d8ac89e0cfd4fe177c61481aca2d5e7ba42044fd218cfd8ea9899f \
322
+ --hash=sha256:ad33e8400e4ec17ba782f7b9cf868977d867ed784a1f5f2ab46e7ba53b6e1e1b \
323
+ --hash=sha256:b4c01941fd2ff87c2a934ee6055bda4ed353a7846b8d4f341c428109e9fcde8c \
324
+ --hash=sha256:bce7d9e614a04d0883af0b3d4d501171fbfca038f12c77fa838d9f198147a23f \
325
+ --hash=sha256:d46cf9e3705ea9485687aa4001a76e44748b609d260af21c4ceea7f2212a501d \
326
+ --hash=sha256:dd432ccc2c72b914e4cb77afce64aab761c1137cc698be3984eee260bcb2896e
327
+ # via librosa
328
+ numba==0.61.2 \
329
+ --hash=sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2 \
330
+ --hash=sha256:3a10a8fc9afac40b1eac55717cece1b8b1ac0b946f5065c89e00bde646b5b154 \
331
+ --hash=sha256:48a53a3de8f8793526cbe330f2a39fe9a6638efcbf11bd63f3d2f9757ae345cd \
332
+ --hash=sha256:4ddce10009bc097b080fc96876d14c051cc0c7679e99de3e0af59014dab7dfe8 \
333
+ --hash=sha256:59321215e2e0ac5fa928a8020ab00b8e57cda8a97384963ac0dfa4d4e6aa54e7 \
334
+ --hash=sha256:5b1bb509d01f23d70325d3a5a0e237cbc9544dd50e50588bc581ba860c213546 \
335
+ --hash=sha256:5f154aaea625fb32cfbe3b80c5456d514d416fcdf79733dd69c0df3a11348e9e \
336
+ --hash=sha256:7d3bcada3c9afba3bed413fba45845f2fb9cd0d2b27dd58a1be90257e293d140 \
337
+ --hash=sha256:8750ee147940a6637b80ecf7f95062185ad8726c8c28a2295b8ec1160a196f7d \
338
+ --hash=sha256:97cf4f12c728cf77c9c1d7c23707e4d8fb4632b46275f8f3397de33e5877af18 \
339
+ --hash=sha256:bdbca73ad81fa196bd53dc12e3aaf1564ae036e0c125f237c7644fe64a4928ab
340
+ # via
341
+ # librosa
342
+ # resampy
343
+ # utau-webui
344
+ numpy==2.2.6 \
345
+ --hash=sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff \
346
+ --hash=sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84 \
347
+ --hash=sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6 \
348
+ --hash=sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f \
349
+ --hash=sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b \
350
+ --hash=sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49 \
351
+ --hash=sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571 \
352
+ --hash=sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff \
353
+ --hash=sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4 \
354
+ --hash=sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566 \
355
+ --hash=sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40 \
356
+ --hash=sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd \
357
+ --hash=sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06 \
358
+ --hash=sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282 \
359
+ --hash=sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3 \
360
+ --hash=sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1 \
361
+ --hash=sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c \
362
+ --hash=sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d \
363
+ --hash=sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2 \
364
+ --hash=sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c \
365
+ --hash=sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f \
366
+ --hash=sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd \
367
+ --hash=sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868 \
368
+ --hash=sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d \
369
+ --hash=sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87 \
370
+ --hash=sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa \
371
+ --hash=sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f \
372
+ --hash=sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda \
373
+ --hash=sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249 \
374
+ --hash=sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de \
375
+ --hash=sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8
376
+ # via
377
+ # gradio
378
+ # h5py
379
+ # librosa
380
+ # numba
381
+ # pandas
382
+ # pyworld
383
+ # resampy
384
+ # scikit-learn
385
+ # scipy
386
+ # soundfile
387
+ # soxr
388
+ # utau-webui
389
+ orjson==3.10.18 \
390
+ --hash=sha256:0315317601149c244cb3ecef246ef5861a64824ccbcb8018d32c66a60a84ffbc \
391
+ --hash=sha256:187ec33bbec58c76dbd4066340067d9ece6e10067bb0cc074a21ae3300caa84e \
392
+ --hash=sha256:1ebeda919725f9dbdb269f59bc94f861afbe2a27dce5608cdba2d92772364d1c \
393
+ --hash=sha256:22748de2a07fcc8781a70edb887abf801bb6142e6236123ff93d12d92db3d406 \
394
+ --hash=sha256:2d808e34ddb24fc29a4d4041dcfafbae13e129c93509b847b14432717d94b44f \
395
+ --hash=sha256:303565c67a6c7b1f194c94632a4a39918e067bd6176a48bec697393865ce4f06 \
396
+ --hash=sha256:356b076f1662c9813d5fa56db7d63ccceef4c271b1fb3dd522aca291375fcf17 \
397
+ --hash=sha256:3a83c9954a4107b9acd10291b7f12a6b29e35e8d43a414799906ea10e75438e6 \
398
+ --hash=sha256:3d600be83fe4514944500fa8c2a0a77099025ec6482e8087d7659e891f23058a \
399
+ --hash=sha256:50c15557afb7f6d63bc6d6348e0337a880a04eaa9cd7c9d569bcb4e760a24753 \
400
+ --hash=sha256:559eb40a70a7494cd5beab2d73657262a74a2c59aff2068fdba8f0424ec5b39d \
401
+ --hash=sha256:5adf5f4eed520a4959d29ea80192fa626ab9a20b2ea13f8f6dc58644f6927103 \
402
+ --hash=sha256:6612787e5b0756a171c7d81ba245ef63a3533a637c335aa7fcb8e665f4a0966f \
403
+ --hash=sha256:69c34b9441b863175cc6a01f2935de994025e773f814412030f269da4f7be147 \
404
+ --hash=sha256:7592bb48a214e18cd670974f289520f12b7aed1fa0b2e2616b8ed9e069e08595 \
405
+ --hash=sha256:7ac6bd7be0dcab5b702c9d43d25e70eb456dfd2e119d512447468f6405b4a69c \
406
+ --hash=sha256:86314fdb5053a2f5a5d881f03fca0219bfdf832912aa88d18676a5175c6916b5 \
407
+ --hash=sha256:8e4b2ae732431127171b875cb2668f883e1234711d3c147ffd69fe5be51a8012 \
408
+ --hash=sha256:9dca85398d6d093dd41dc0983cbf54ab8e6afd1c547b6b8a311643917fbf4e0c \
409
+ --hash=sha256:9f72f100cee8dde70100406d5c1abba515a7df926d4ed81e20a9730c062fe9ad \
410
+ --hash=sha256:ad8eacbb5d904d5591f27dee4031e2c1db43d559edb8f91778efd642d70e6bea \
411
+ --hash=sha256:aed411bcb68bf62e85588f2a7e03a6082cc42e5a2796e06e72a962d7c6310b52 \
412
+ --hash=sha256:bb70d489bc79b7519e5803e2cc4c72343c9dc1154258adf2f8925d0b60da7c58 \
413
+ --hash=sha256:c382a5c0b5931a5fc5405053d36c1ce3fd561694738626c77ae0b1dfc0242ca1 \
414
+ --hash=sha256:e0da26957e77e9e55a6c2ce2e7182a36a6f6b180ab7189315cb0995ec362e049 \
415
+ --hash=sha256:e8da3947d92123eda795b68228cafe2724815621fe35e8e320a9e9593a4bcd53 \
416
+ --hash=sha256:e9e86a6af31b92299b00736c89caf63816f70a4001e750bda179e15564d7a034 \
417
+ --hash=sha256:f3c29eb9a81e2fbc6fd7ddcfba3e101ba92eaff455b8d602bf7511088bbc0eae \
418
+ --hash=sha256:f54c1385a0e6aba2f15a40d703b858bedad36ded0491e55d35d905b2c34a4cc3 \
419
+ --hash=sha256:f872bef9f042734110642b7a11937440797ace8c87527de25e0c53558b579ccc \
420
+ --hash=sha256:f9f94cf6d3f9cd720d641f8399e390e7411487e493962213390d1ae45c7814fc
421
+ # via gradio
422
+ packaging==25.0 \
423
+ --hash=sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 \
424
+ --hash=sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f
425
+ # via
426
+ # gradio
427
+ # gradio-client
428
+ # huggingface-hub
429
+ # lazy-loader
430
+ # pooch
431
+ pandas==2.3.0 \
432
+ --hash=sha256:094e271a15b579650ebf4c5155c05dcd2a14fd4fdd72cf4854b2f7ad31ea30be \
433
+ --hash=sha256:1a881bc1309f3fce34696d07b00f13335c41f5f5a8770a33b09ebe23261cfc67 \
434
+ --hash=sha256:1d2b33e68d0ce64e26a4acc2e72d747292084f4e8db4c847c6f5f6cbe56ed6d8 \
435
+ --hash=sha256:213cd63c43263dbb522c1f8a7c9d072e25900f6975596f883f4bebd77295d4f3 \
436
+ --hash=sha256:2c7e2fc25f89a49a11599ec1e76821322439d90820108309bf42130d2f36c983 \
437
+ --hash=sha256:2eb4728a18dcd2908c7fccf74a982e241b467d178724545a48d0caf534b38ebf \
438
+ --hash=sha256:34600ab34ebf1131a7613a260a61dbe8b62c188ec0ea4c296da7c9a06b004133 \
439
+ --hash=sha256:404d681c698e3c8a40a61d0cd9412cc7364ab9a9cc6e144ae2992e11a2e77a20 \
440
+ --hash=sha256:430a63bae10b5086995db1b02694996336e5a8ac9a96b4200572b413dfdfccb9 \
441
+ --hash=sha256:4930255e28ff5545e2ca404637bcc56f031893142773b3468dc021c6c32a1390 \
442
+ --hash=sha256:6021910b086b3ca756755e86ddc64e0ddafd5e58e076c72cb1585162e5ad259b \
443
+ --hash=sha256:951805d146922aed8357e4cc5671b8b0b9be1027f0619cea132a9f3f65f2f09c \
444
+ --hash=sha256:9ff730713d4c4f2f1c860e36c005c7cefc1c7c80c21c0688fd605aa43c9fcf09 \
445
+ --hash=sha256:b9d8c3187be7479ea5c3d30c32a5d73d62a621166675063b2edd21bc47614027 \
446
+ --hash=sha256:ba24af48643b12ffe49b27065d3babd52702d95ab70f50e1b34f71ca703e2c0d \
447
+ --hash=sha256:bb32dc743b52467d488e7a7c8039b821da2826a9ba4f85b89ea95274f863280f \
448
+ --hash=sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249 \
449
+ --hash=sha256:c6da97aeb6a6d233fb6b17986234cc723b396b50a3c6804776351994f2a658fd \
450
+ --hash=sha256:e1991bbb96f4050b09b5f811253c4f3cf05ee89a589379aa36cd623f21a31d6f \
451
+ --hash=sha256:e78ad363ddb873a631e92a3c063ade1ecfb34cae71e9a2be6ad100f875ac1042 \
452
+ --hash=sha256:f925f1ef673b4bd0271b1809b72b3270384f2b7d9d14a189b12b7fc02574d575
453
+ # via gradio
454
+ pillow==11.2.1 \
455
+ --hash=sha256:062b7a42d672c45a70fa1f8b43d1d38ff76b63421cbbe7f88146b39e8a558d91 \
456
+ --hash=sha256:0c7b29dbd4281923a2bfe562acb734cee96bbb129e96e6972d315ed9f232bef4 \
457
+ --hash=sha256:14e33b28bf17c7a38eede290f77db7c664e4eb01f7869e37fa98a5aa95978941 \
458
+ --hash=sha256:191955c55d8a712fab8934a42bfefbf99dd0b5875078240943f913bb66d46d9f \
459
+ --hash=sha256:1d535df14716e7f8776b9e7fee118576d65572b4aad3ed639be9e4fa88a1cad3 \
460
+ --hash=sha256:21e1470ac9e5739ff880c211fc3af01e3ae505859392bf65458c224d0bf283eb \
461
+ --hash=sha256:225c832a13326e34f212d2072982bb1adb210e0cc0b153e688743018c94a2681 \
462
+ --hash=sha256:25a5f306095c6780c52e6bbb6109624b95c5b18e40aab1c3041da3e9e0cd3e2d \
463
+ --hash=sha256:31df6e2d3d8fc99f993fd253e97fae451a8db2e7207acf97859732273e108406 \
464
+ --hash=sha256:36d6b82164c39ce5482f649b437382c0fb2395eabc1e2b1702a6deb8ad647d6e \
465
+ --hash=sha256:3e645b020f3209a0181a418bffe7b4a93171eef6c4ef6cc20980b30bebf17b7d \
466
+ --hash=sha256:3fe735ced9a607fee4f481423a9c36701a39719252a9bb251679635f99d0f7d2 \
467
+ --hash=sha256:4eb92eca2711ef8be42fd3f67533765d9fd043b8c80db204f16c8ea62ee1a751 \
468
+ --hash=sha256:5119225c622403afb4b44bad4c1ca6c1f98eed79db8d3bc6e4e160fc6339d66c \
469
+ --hash=sha256:598174aef4589af795f66f9caab87ba4ff860ce08cd5bb447c6fc553ffee603c \
470
+ --hash=sha256:63b5dff3a68f371ea06025a1a6966c9a1e1ee452fc8020c2cd0ea41b83e9037b \
471
+ --hash=sha256:74ee3d7ecb3f3c05459ba95eed5efa28d6092d751ce9bf20e3e253a4e497e691 \
472
+ --hash=sha256:750f96efe0597382660d8b53e90dd1dd44568a8edb51cb7f9d5d918b80d4de14 \
473
+ --hash=sha256:78092232a4ab376a35d68c4e6d5e00dfd73454bd12b230420025fbe178ee3b0b \
474
+ --hash=sha256:78afba22027b4accef10dbd5eed84425930ba41b3ea0a86fa8d20baaf19d807f \
475
+ --hash=sha256:7bdb5e09068332578214cadd9c05e3d64d99e0e87591be22a324bdbc18925be0 \
476
+ --hash=sha256:8ce2e8411c7aaef53e6bb29fe98f28cd4fbd9a1d9be2eeea434331aac0536b22 \
477
+ --hash=sha256:9622e3b6c1d8b551b6e6f21873bdcc55762b4b2126633014cea1803368a9aa16 \
478
+ --hash=sha256:9ee66787e095127116d91dea2143db65c7bb1e232f617aa5957c0d9d2a3f23a7 \
479
+ --hash=sha256:a64dd61998416367b7ef979b73d3a85853ba9bec4c2925f74e588879a58716b6 \
480
+ --hash=sha256:ad275964d52e2243430472fc5d2c2334b4fc3ff9c16cb0a19254e25efa03a155 \
481
+ --hash=sha256:b0e130705d568e2f43a17bcbe74d90958e8a16263868a12c3e0d9c8162690830 \
482
+ --hash=sha256:b2dbea1012ccb784a65349f57bbc93730b96e85b42e9bf7b01ef40443db720b4 \
483
+ --hash=sha256:d189ba1bebfbc0c0e529159631ec72bb9e9bc041f01ec6d3233d6d82eb823bc1 \
484
+ --hash=sha256:da3104c57bbd72948d75f6a9389e6727d2ab6333c3617f0a89d72d4940aa0443 \
485
+ --hash=sha256:e0b55f27f584ed623221cfe995c912c61606be8513bfa0e07d2c674b4516d9dd \
486
+ --hash=sha256:f91ebf30830a48c825590aede79376cb40f110b387c17ee9bd59932c961044f9 \
487
+ --hash=sha256:fdec757fea0b793056419bca3e9932eb2b0ceec90ef4813ea4c1e072c389eb28 \
488
+ --hash=sha256:fe15238d3798788d00716637b3d4e7bb6bde18b26e5d08335a96e88564a36b6b
489
+ # via gradio
490
+ platformdirs==4.3.8 \
491
+ --hash=sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc \
492
+ --hash=sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4
493
+ # via pooch
494
+ pooch==1.8.2 \
495
+ --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \
496
+ --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10
497
+ # via librosa
498
+ pycparser==2.22 \
499
+ --hash=sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6 \
500
+ --hash=sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc
501
+ # via cffi
502
+ pydantic==2.11.5 \
503
+ --hash=sha256:7f853db3d0ce78ce8bbb148c401c2cdd6431b3473c0cdff2755c7690952a7b7a \
504
+ --hash=sha256:f9c26ba06f9747749ca1e5c94d6a85cb84254577553c8785576fd38fa64dc0f7
505
+ # via
506
+ # fastapi
507
+ # gradio
508
+ pydantic-core==2.33.2 \
509
+ --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \
510
+ --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \
511
+ --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \
512
+ --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \
513
+ --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \
514
+ --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \
515
+ --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \
516
+ --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \
517
+ --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \
518
+ --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \
519
+ --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \
520
+ --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \
521
+ --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \
522
+ --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \
523
+ --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \
524
+ --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \
525
+ --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \
526
+ --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \
527
+ --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \
528
+ --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \
529
+ --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \
530
+ --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \
531
+ --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \
532
+ --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \
533
+ --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \
534
+ --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \
535
+ --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \
536
+ --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \
537
+ --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \
538
+ --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \
539
+ --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \
540
+ --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6
541
+ # via pydantic
542
+ pydub==0.25.1 \
543
+ --hash=sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6 \
544
+ --hash=sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f
545
+ # via gradio
546
+ pygments==2.19.1 ; sys_platform != 'emscripten' \
547
+ --hash=sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f \
548
+ --hash=sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c
549
+ # via rich
550
+ python-dateutil==2.9.0.post0 \
551
+ --hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \
552
+ --hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427
553
+ # via pandas
554
+ python-multipart==0.0.20 \
555
+ --hash=sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104 \
556
+ --hash=sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13
557
+ # via gradio
558
+ pytz==2025.2 \
559
+ --hash=sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3 \
560
+ --hash=sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00
561
+ # via pandas
562
+ pyworld==0.3.5 \
563
+ --hash=sha256:1b93e53cddb67a0e4faa34d6cf919ac6c662feb1c8c0ed901d71b595ab396aa3 \
564
+ --hash=sha256:59b48961c2ac34fb01efeb1a77d3eda69c41b676858cbc3a82dfb7602c0c762b \
565
+ --hash=sha256:860c5c3528f1dbc5c68fa71a16e3bb6990244619e5b9baf62952f3a6bfc6131c
566
+ # via utau-webui
567
+ pyyaml==6.0.2 \
568
+ --hash=sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48 \
569
+ --hash=sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133 \
570
+ --hash=sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484 \
571
+ --hash=sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5 \
572
+ --hash=sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc \
573
+ --hash=sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1 \
574
+ --hash=sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652 \
575
+ --hash=sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5 \
576
+ --hash=sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8 \
577
+ --hash=sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476 \
578
+ --hash=sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563 \
579
+ --hash=sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b \
580
+ --hash=sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425 \
581
+ --hash=sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183 \
582
+ --hash=sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab \
583
+ --hash=sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725 \
584
+ --hash=sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e \
585
+ --hash=sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4 \
586
+ --hash=sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba
587
+ # via
588
+ # gradio
589
+ # huggingface-hub
590
+ requests==2.32.4 \
591
+ --hash=sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c \
592
+ --hash=sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422
593
+ # via
594
+ # huggingface-hub
595
+ # pooch
596
+ resampy==0.4.3 \
597
+ --hash=sha256:a0d1c28398f0e55994b739650afef4e3974115edbe96cd4bb81968425e916e47 \
598
+ --hash=sha256:ad2ed64516b140a122d96704e32bc0f92b23f45419e8b8f478e5a05f83edcebd
599
+ # via utau-webui
600
+ rich==14.0.0 ; sys_platform != 'emscripten' \
601
+ --hash=sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0 \
602
+ --hash=sha256:82f1bc23a6a21ebca4ae0c45af9bdbc492ed20231dcb63f297d6d1021a9d5725
603
+ # via typer
604
+ ruff==0.11.13 ; sys_platform != 'emscripten' \
605
+ --hash=sha256:1808b3ed53e1a777c2ef733aca9051dc9bf7c99b26ece15cb59a0320fbdbd629 \
606
+ --hash=sha256:26816a218ca6ef02142343fd24c70f7cd8c5aa6c203bca284407adf675984432 \
607
+ --hash=sha256:26fa247dc68d1d4e72c179e08889a25ac0c7ba4d78aecfc835d49cbfd60bf514 \
608
+ --hash=sha256:29c3189895a8a6a657b7af4e97d330c8a3afd2c9c8f46c81e2fc5a31866517e3 \
609
+ --hash=sha256:4a9ddd3ec62a9a89578c85842b836e4ac832d4a2e0bfaad3b02243f930ceafcc \
610
+ --hash=sha256:4bdfbf1240533f40042ec00c9e09a3aade6f8c10b6414cf11b519488d2635d46 \
611
+ --hash=sha256:4ffbc82d70424b275b089166310448051afdc6e914fdab90e08df66c43bb5ca9 \
612
+ --hash=sha256:51c3f95abd9331dc5b87c47ac7f376db5616041173826dfd556cfe3d4977f492 \
613
+ --hash=sha256:53b15a9dfdce029c842e9a5aebc3855e9ab7771395979ff85b7c1dedb53ddc2b \
614
+ --hash=sha256:55e4bc3a77842da33c16d55b32c6cac1ec5fb0fbec9c8c513bdce76c4f922165 \
615
+ --hash=sha256:633bf2c6f35678c56ec73189ba6fa19ff1c5e4807a78bf60ef487b9dd272cc71 \
616
+ --hash=sha256:6c51f93029d54a910d3d24f7dd0bb909e31b6cd989a5e4ac513f4eb41629f0dc \
617
+ --hash=sha256:96c27935418e4e8e77a26bb05962817f28b8ef3843a6c6cc49d8783b5507f250 \
618
+ --hash=sha256:ab153241400789138d13f362c43f7edecc0edfffce2afa6a68434000ecd8f69a \
619
+ --hash=sha256:aef9c9ed1b5ca28bb15c7eac83b8670cf3b20b478195bd49c8d756ba0a36cf48 \
620
+ --hash=sha256:b4385285e9179d608ff1d2fb9922062663c658605819a6876d8beef0c30b7f3b \
621
+ --hash=sha256:d237a496e0778d719efb05058c64d28b757c77824e04ffe8796c7436e26712b7 \
622
+ --hash=sha256:d28ce58b5ecf0f43c1b71edffabe6ed7f245d5336b17805803312ec9bc665933
623
+ # via gradio
624
+ safehttpx==0.1.6 \
625
+ --hash=sha256:407cff0b410b071623087c63dd2080c3b44dc076888d8c5823c00d1e58cb381c \
626
+ --hash=sha256:b356bfc82cee3a24c395b94a2dbeabbed60aff1aa5fa3b5fe97c4f2456ebce42
627
+ # via gradio
628
+ scikit-learn==1.7.0 \
629
+ --hash=sha256:014e07a23fe02e65f9392898143c542a50b6001dbe89cb867e19688e468d049b \
630
+ --hash=sha256:0521cb460426c56fee7e07f9365b0f45ec8ca7b2d696534ac98bfb85e7ae4775 \
631
+ --hash=sha256:0b2f8a0b1e73e9a08b7cc498bb2aeab36cdc1f571f8ab2b35c6e5d1c7115d97d \
632
+ --hash=sha256:126c09740a6f016e815ab985b21e3a0656835414521c81fc1a8da78b679bdb75 \
633
+ --hash=sha256:1babf2511e6ffd695da7a983b4e4d6de45dce39577b26b721610711081850906 \
634
+ --hash=sha256:317ca9f83acbde2883bd6bb27116a741bfcb371369706b4f9973cf30e9a03b0d \
635
+ --hash=sha256:34cc8d9d010d29fb2b7cbcd5ccc24ffdd80515f65fe9f1e4894ace36b267ce19 \
636
+ --hash=sha256:5abd2acff939d5bd4701283f009b01496832d50ddafa83c90125a4e41c33e314 \
637
+ --hash=sha256:5b7974f1f32bc586c90145df51130e02267e4b7e77cab76165c76cf43faca0d9 \
638
+ --hash=sha256:63017a5f9a74963d24aac7590287149a8d0f1a0799bbe7173c0d8ba1523293c0 \
639
+ --hash=sha256:9f39f6a811bf3f15177b66c82cbe0d7b1ebad9f190737dcdef77cfca1ea3c19c \
640
+ --hash=sha256:c01e869b15aec88e2cdb73d27f15bdbe03bce8e2fb43afbe77c45d399e73a5a3 \
641
+ --hash=sha256:c2c7243d34aaede0efca7a5a96d67fddaebb4ad7e14a70991b9abee9dc5c0379 \
642
+ --hash=sha256:e39d95a929b112047c25b775035c8c234c5ca67e681ce60d12413afb501129f7 \
643
+ --hash=sha256:e7e7ced20582d3a5516fb6f405fd1d254e1f5ce712bfef2589f51326af6346e8
644
+ # via librosa
645
+ scipy==1.15.3 \
646
+ --hash=sha256:05dc6abcd105e1a29f95eada46d4a3f251743cfd7d3ae8ddb4088047f24ea477 \
647
+ --hash=sha256:06efcba926324df1696931a57a176c80848ccd67ce6ad020c810736bfd58eb1c \
648
+ --hash=sha256:0a769105537aa07a69468a0eefcd121be52006db61cdd8cac8a0e68980bbb723 \
649
+ --hash=sha256:0bdd905264c0c9cfa74a4772cdb2070171790381a5c4d312c973382fc6eaf730 \
650
+ --hash=sha256:0ff17c0bb1cb32952c09217d8d1eed9b53d1463e5f1dd6052c7857f83127d539 \
651
+ --hash=sha256:14ed70039d182f411ffc74789a16df3835e05dc469b898233a245cdfd7f162cb \
652
+ --hash=sha256:185cd3d6d05ca4b44a8f1595af87f9c372bb6acf9c808e99aa3e9aa03bd98cf6 \
653
+ --hash=sha256:271e3713e645149ea5ea3e97b57fdab61ce61333f97cfae392c28ba786f9bb49 \
654
+ --hash=sha256:2c620736bcc334782e24d173c0fdbb7590a0a436d2fdf39310a8902505008759 \
655
+ --hash=sha256:3ac07623267feb3ae308487c260ac684b32ea35fd81e12845039952f558047b8 \
656
+ --hash=sha256:40e54d5c7e7ebf1aa596c374c49fa3135f04648a0caabcb66c52884b943f02b4 \
657
+ --hash=sha256:50f9e62461c95d933d5c5ef4a1f2ebf9a2b4e83b0db374cb3f1de104d935922e \
658
+ --hash=sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed \
659
+ --hash=sha256:5e721fed53187e71d0ccf382b6bf977644c533e506c4d33c3fb24de89f5c3ed5 \
660
+ --hash=sha256:6487aa99c2a3d509a5227d9a5e889ff05830a06b2ce08ec30df6d79db5fcd5c5 \
661
+ --hash=sha256:6ac6310fdbfb7aa6612408bd2f07295bcbd3fda00d2d702178434751fe48e019 \
662
+ --hash=sha256:6cfd56fc1a8e53f6e89ba3a7a7251f7396412d655bca2aa5611c8ec9a6784a1e \
663
+ --hash=sha256:76ad1fb5f8752eabf0fa02e4cc0336b4e8f021e2d5f061ed37d6d264db35e3ca \
664
+ --hash=sha256:79167bba085c31f38603e11a267d862957cbb3ce018d8b38f79ac043bc92d825 \
665
+ --hash=sha256:7e11270a000969409d37ed399585ee530b9ef6aa99d50c019de4cb01e8e54e62 \
666
+ --hash=sha256:8c9ed3ba2c8a2ce098163a9bdb26f891746d02136995df25227a20e71c396ebb \
667
+ --hash=sha256:9db984639887e3dffb3928d118145ffe40eff2fa40cb241a306ec57c219ebbbb \
668
+ --hash=sha256:b90ab29d0c37ec9bf55424c064312930ca5f4bde15ee8619ee44e69319aab163 \
669
+ --hash=sha256:c05045d8b9bfd807ee1b9f38761993297b10b245f012b11b13b91ba8945f7e45 \
670
+ --hash=sha256:c9deabd6d547aee2c9a81dee6cc96c6d7e9a9b1953f74850c179f91fdc729cb7 \
671
+ --hash=sha256:dde4fc32993071ac0c7dd2d82569e544f0bdaff66269cb475e0f369adad13f11 \
672
+ --hash=sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf \
673
+ --hash=sha256:f77f853d584e72e874d87357ad70f44b437331507d1c311457bed8ed2b956126
674
+ # via
675
+ # librosa
676
+ # scikit-learn
677
+ # utau-webui
678
+ semantic-version==2.10.0 \
679
+ --hash=sha256:bdabb6d336998cbb378d4b9db3a4b56a1e3235701dc05ea2690d9a997ed5041c \
680
+ --hash=sha256:de78a3b8e0feda74cabc54aab2da702113e33ac9d9eb9d2389bcf1f58b7d9177
681
+ # via gradio
682
+ setuptools==80.9.0 \
683
+ --hash=sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922 \
684
+ --hash=sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c
685
+ # via utau-webui
686
+ shellingham==1.5.4 ; sys_platform != 'emscripten' \
687
+ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \
688
+ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de
689
+ # via typer
690
+ six==1.17.0 \
691
+ --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \
692
+ --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81
693
+ # via python-dateutil
694
+ sniffio==1.3.1 \
695
+ --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \
696
+ --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc
697
+ # via anyio
698
+ soundfile==0.13.1 \
699
+ --hash=sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618 \
700
+ --hash=sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9 \
701
+ --hash=sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593 \
702
+ --hash=sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33 \
703
+ --hash=sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb \
704
+ --hash=sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445 \
705
+ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \
706
+ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5
707
+ # via
708
+ # librosa
709
+ # utau-webui
710
+ soxr==0.5.0.post1 \
711
+ --hash=sha256:4704ba6b13a3f1e41d12acf192878384c1c31f71ce606829c64abdf64a8d7d32 \
712
+ --hash=sha256:7092b9f3e8a416044e1fa138c8172520757179763b85dc53aa9504f4813cff73 \
713
+ --hash=sha256:a3f16810dd649ab1f433991d2a9661e9e6a116c2b4101039b53b3c3e90a094fc \
714
+ --hash=sha256:b1be9fee90afb38546bdbd7bde714d1d9a8c5a45137f97478a83b65e7f3146f6 \
715
+ --hash=sha256:bd052a66471a7335b22a6208601a9d0df7b46b8d087dce4ff6e13eed6a33a2a1 \
716
+ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31
717
+ # via librosa
718
+ standard-aifc==3.13.0 ; python_full_version >= '3.13' \
719
+ --hash=sha256:64e249c7cb4b3daf2fdba4e95721f811bde8bdfc43ad9f936589b7bb2fae2e43 \
720
+ --hash=sha256:f7ae09cc57de1224a0dd8e3eb8f73830be7c3d0bc485de4c1f82b4a7f645ac66
721
+ # via librosa
722
+ standard-chunk==3.13.0 ; python_full_version >= '3.13' \
723
+ --hash=sha256:17880a26c285189c644bd5bd8f8ed2bdb795d216e3293e6dbe55bbd848e2982c \
724
+ --hash=sha256:4ac345d37d7e686d2755e01836b8d98eda0d1a3ee90375e597ae43aaf064d654
725
+ # via standard-aifc
726
+ standard-sunau==3.13.0 ; python_full_version >= '3.13' \
727
+ --hash=sha256:53af624a9529c41062f4c2fd33837f297f3baa196b0cfceffea6555654602622 \
728
+ --hash=sha256:b319a1ac95a09a2378a8442f403c66f4fd4b36616d6df6ae82b8e536ee790908
729
+ # via librosa
730
+ starlette==0.46.2 \
731
+ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \
732
+ --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5
733
+ # via
734
+ # fastapi
735
+ # gradio
736
+ threadpoolctl==3.6.0 \
737
+ --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \
738
+ --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e
739
+ # via scikit-learn
740
+ tomlkit==0.13.3 \
741
+ --hash=sha256:430cf247ee57df2b94ee3fbe588e71d362a941ebb545dec29b53961d61add2a1 \
742
+ --hash=sha256:c89c649d79ee40629a9fda55f8ace8c6a1b42deb912b2a8fd8d942ddadb606b0
743
+ # via gradio
744
+ tqdm==4.67.1 \
745
+ --hash=sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 \
746
+ --hash=sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2
747
+ # via huggingface-hub
748
+ typer==0.16.0 ; sys_platform != 'emscripten' \
749
+ --hash=sha256:1f79bed11d4d02d4310e3c1b7ba594183bcedb0ac73b27a9e5f28f6fb5b98855 \
750
+ --hash=sha256:af377ffaee1dbe37ae9440cb4e8f11686ea5ce4e9bae01b84ae7c63b87f1dd3b
751
+ # via gradio
752
+ typing-extensions==4.14.0 \
753
+ --hash=sha256:8676b788e32f02ab42d9e7c61324048ae4c6d844a399eebace3d4979d75ceef4 \
754
+ --hash=sha256:a1514509136dd0b477638fc68d6a91497af5076466ad0fa6c338e44e359944af
755
+ # via
756
+ # anyio
757
+ # fastapi
758
+ # gradio
759
+ # gradio-client
760
+ # huggingface-hub
761
+ # librosa
762
+ # pydantic
763
+ # pydantic-core
764
+ # typer
765
+ # typing-inspection
766
+ typing-inspection==0.4.1 \
767
+ --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \
768
+ --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28
769
+ # via pydantic
770
+ tzdata==2025.2 \
771
+ --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \
772
+ --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9
773
+ # via pandas
774
+ urllib3==2.4.0 \
775
+ --hash=sha256:414bc6535b787febd7567804cc015fee39daab8ad86268f1310a9250697de466 \
776
+ --hash=sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813
777
+ # via
778
+ # gradio
779
+ # requests
780
+ uvicorn==0.34.3 ; sys_platform != 'emscripten' \
781
+ --hash=sha256:16246631db62bdfbf069b0645177d6e8a77ba950cfedbfd093acef9444e4d885 \
782
+ --hash=sha256:35919a9a979d7a59334b6b10e05d77c1d0d574c50e0fc98b8b1a0f165708b55a
783
+ # via gradio
784
+ websockets==15.0.1 \
785
+ --hash=sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2 \
786
+ --hash=sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5 \
787
+ --hash=sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8 \
788
+ --hash=sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375 \
789
+ --hash=sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597 \
790
+ --hash=sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f \
791
+ --hash=sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3 \
792
+ --hash=sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4 \
793
+ --hash=sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665 \
794
+ --hash=sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22 \
795
+ --hash=sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675 \
796
+ --hash=sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4 \
797
+ --hash=sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65 \
798
+ --hash=sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151 \
799
+ --hash=sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d \
800
+ --hash=sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee \
801
+ --hash=sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa \
802
+ --hash=sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9 \
803
+ --hash=sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe \
804
+ --hash=sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561 \
805
+ --hash=sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215 \
806
+ --hash=sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931 \
807
+ --hash=sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f \
808
+ --hash=sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7
809
+ # via gradio-client
straycat.py ADDED
@@ -0,0 +1,825 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ straycat - Yet another WORLD-based UTAU resampler.
4
+
5
+ Original source: https://github.com/UtaUtaUtau/straycat
6
+ Copyright (c) UtaUtaUtau
7
+ Licensed under MIT License
8
+
9
+ This file is part of the straycat project and is used under the terms
10
+ of the MIT License. See the original repository for full license text.
11
+ """
12
+
13
+ import logging
14
+ logging.basicConfig(format='%(message)s', level=logging.INFO)
15
+ import sys
16
+ import os
17
+ import pyworld as world # Vocoder
18
+ import numpy as np # Numpy <3
19
+ from numba import njit, vectorize, float64, optional # JIT compilation stuff (and ufuncs)
20
+ import soundfile as sf # WAV read + write
21
+ import scipy.signal as signal # for filtering
22
+ import scipy.interpolate as interp # Interpolator for feats
23
+ import scipy.ndimage as ndimage
24
+ import resampy # Resampler (as in sampling rate stuff)
25
+ from pathlib import Path # path manipulation
26
+ import re
27
+
28
+ version = '0.4.0'
29
+ help_string = '''usage: straycat in_file out_file pitch velocity [flags] [offset] [length] [consonant] [cutoff] [volume] [modulation] [tempo] [pitch_string]
30
+
31
+ Resamples using the WORLD Vocoder.
32
+
33
+ arguments:
34
+ \tin_file\t\tPath to input file.
35
+ \tout_file\tPath to output file.
36
+ \tpitch\t\tThe pitch to render on.
37
+ \tvelocity\tThe consonant velocity of the render.
38
+
39
+ optional arguments:
40
+ \tflags\t\tThe flags of the render.
41
+ \toffset\t\tThe offset from the start of the render area of the sample. (default: 0)
42
+ \tlength\t\tThe length of the stretched area in milliseconds. (default: 1000)
43
+ \tconsonant\tThe unstretched area of the render in milliseconds. (default: 0)
44
+ \tcutoff\t\tThe cutoff from the end or from the offset for the render area of the sample. (default: 0)
45
+ \tvolume\t\tThe volume of the render in percentage. (default: 100)
46
+ \tmodulation\tThe pitch modulation of the render in percentage. (default: 0)
47
+ \ttempo\t\tThe tempo of the render. Needs to have a ! at the start. (default: !100)
48
+ \tpitch_string\tThe UTAU pitchbend parameter written in Base64 with RLE encoding. (default: AA)'''
49
+
50
+ notes = {'C' : 0, 'C#' : 1, 'D' : 2, 'D#' : 3, 'E' : 4, 'F' : 5, 'F#' : 6, 'G' : 7, 'G#' : 8, 'A' : 9, 'A#' : 10, 'B' : 11} # Note names lol
51
+ note_re = re.compile(r'([A-G]#?)(-?\d+)') # Note Regex for conversion
52
+ default_fs = 44100 # UTAU only really likes 44.1khz
53
+ fft_size = world.get_cheaptrick_fft_size(default_fs, world.default_f0_floor) # It's just 2048 but you know
54
+ cache_ext = '.sc.npz' # cache file extension
55
+
56
+ # Giving it better range
57
+ f0_floor = world.default_f0_floor
58
+ f0_ceil = 1760
59
+
60
+ # Flags
61
# Recognised flag names. Two-letter names are listed before one-letter names.
flags = [
    'fe', 'fl', 'fo', 'fv', 'fp', 've', 'vo',
    'g', 't', 'A', 'B', 'G', 'P', 'S', 'p', 'R', 'D', 'C',
]
# Matches one flag name followed by an optional signed integer value.
flag_re = re.compile(f"({'|'.join(flags)})([+-]?\\d+)?")
65
+
66
+ # Utility functions
67
@vectorize([float64(float64, float64, float64)], nopython=True)
def smoothstep(edge0, edge1, x):
    """Element-wise GLSL-style smoothstep.

    Maps x linearly so that edge0 -> 0 and edge1 -> 1, clamps to [0, 1] and
    applies the cubic 3t^2 - 2t^3. The edges may be given in descending order
    to obtain a falling ramp.
    """
    t = (x - edge0) / (edge1 - edge0)
    if t < 0:
        # Below the ramp: the cubic evaluates to 0 at t = 0.
        return 0.0
    if t > 1:
        # Above the ramp: the cubic evaluates to 1 at t = 1.
        return 1.0
    return 3*t*t - 2*t*t*t
76
+
77
@vectorize([float64(float64, float64, float64)], nopython=True)
def clip(x, x_min, x_max):
    """Element-wise clamp of x to the interval [x_min, x_max].

    The lower bound is tested first, so it wins if the bounds are inverted
    and x lies below x_min.
    """
    return x_min if x < x_min else (x_max if x > x_max else x)
85
+
86
@vectorize([float64(float64, float64)], nopython=True)
def bias(x, a):
    """Element-wise Schlick bias of x with bias amount a.

    a == 0 and a == 1 are special-cased to the constants 0 and 1; every other
    value goes through x / ((1/a - 2) * (1 - x) + 1).
    """
    if a == 0:
        return 0.0
    if a == 1:
        return 1.0
    slope = 1 / a - 2
    return x / (slope * (1 - x) + 1)
94
+
95
def highpass(x, fs=44100, cutoff=3000, order=1):
    """Zero-phase Butterworth highpass.

    The filter is applied forward and backward (sosfiltfilt), so the
    effective order is twice `order`.

    Parameters
    ----------
    x : ndarray
        Signal to filter.
    fs : float
        Sampling rate in Hz.
    cutoff : float
        Cutoff frequency in Hz.
    order : int
        Order of the single-pass Butterworth design.

    Returns
    -------
    ndarray
        The filtered signal.
    """
    normalized_cutoff = cutoff / (0.5 * fs)  # fraction of the Nyquist frequency
    sections = signal.butter(order, normalized_cutoff, btype='high', output='sos')
    return signal.sosfiltfilt(sections, x)
101
+
102
def lowpass(x, fs=44100, cutoff=16000, order=1):
    """Zero-phase Butterworth lowpass.

    The filter is applied forward and backward (sosfiltfilt), so the
    effective order is twice `order`.

    Parameters
    ----------
    x : ndarray
        Signal to filter.
    fs : float
        Sampling rate in Hz.
    cutoff : float
        Cutoff frequency in Hz.
    order : int
        Order of the single-pass Butterworth design.

    Returns
    -------
    ndarray
        The filtered signal.
    """
    normalized_cutoff = cutoff / (0.5 * fs)  # fraction of the Nyquist frequency
    sections = signal.butter(order, normalized_cutoff, btype='low', output='sos')
    return signal.sosfiltfilt(sections, x)
108
+
109
+ # Pitch string interpreter
110
def to_uint6(b64):
    """Convert one Base64 character to an unsigned 6-bit integer.

    Parameters
    ----------
    b64 : str
        A single character from the standard Base64 alphabet
        (A-Z, a-z, 0-9, '+', '/').

    Returns
    -------
    int
        The value of the character, in the range 0 to 63.

    Raises
    ------
    ValueError
        If the character is not part of the Base64 alphabet.
    TypeError
        If `b64` is not a single character (raised by ord).
    """
    c = ord(b64)  # Convert based on ASCII mapping
    # Each range is bounded on both sides so that characters lying between
    # the alphabet blocks (e.g. '[', '@', ':', '{') are rejected instead of
    # being silently mapped to a wrong or out-of-range value.
    if 65 <= c <= 90:     # 'A'-'Z' -> 0-25
        return c - 65
    if 97 <= c <= 122:    # 'a'-'z' -> 26-51
        return c - 71
    if 48 <= c <= 57:     # '0'-'9' -> 52-61
        return c + 4
    if c == 43:           # '+'
        return 62
    if c == 47:           # '/'
        return 63
    raise ValueError(f'Invalid Base64 character: {b64!r}')
136
+
137
def to_int12(b64):
    """Convert two Base64 characters to a signed 12-bit integer.

    Parameters
    ----------
    b64 : str
        The Base64 string. Only the first two characters are used.

    Returns
    -------
    int
        The two characters interpreted as a two's complement 12-bit integer
        (-2048 to 2047).

    Raises
    ------
    ValueError
        If fewer than two characters are given.
    """
    if len(b64) < 2:
        raise ValueError(f'Expected two Base64 characters, got {b64!r}')
    uint12 = (to_uint6(b64[0]) << 6) | to_uint6(b64[1])  # Combine two uint6 into a uint12
    # Bit 11 is the sign bit; subtract 2**12 to get the two's complement value.
    return uint12 - 4096 if uint12 & 0x800 else uint12
155
+
156
def to_int12_stream(b64):
    """Decode a Base64 string into a list of signed 12-bit integers.

    Parameters
    ----------
    b64 : str
        The Base64 string, read two characters at a time.

    Returns
    -------
    list
        One signed 12-bit integer per consecutive pair of characters.
    """
    return [to_int12(b64[pos:pos + 2]) for pos in range(0, len(b64), 2)]
173
+
174
def pitch_string_to_cents(x):
    """Convert UTAU's pitchbend argument to an ndarray of pitch offsets in cents.

    The argument is split on '#'. Fields alternate between Base64 data and a
    decimal run length; a run length appends that many extra copies of the
    last decoded value.

    Parameters
    ----------
    x : str
        The pitchbend argument.

    Returns
    -------
    ndarray
        The pitch offsets in cents. An empty argument gives an empty array,
        a constant curve gives zeros of the same length, and any other curve
        gets one trailing zero appended.

    Raises
    ------
    ValueError
        If a run length appears before any pitch value was decoded.
    """
    fields = x.split('#')  # Split RLE encoding
    res = []
    for i in range(0, len(fields), 2):
        # Go through each (data, run length) pair; the last pair may lack the run length
        pair = fields[i:i+2]
        res.extend(to_int12_stream(pair[0]))
        if len(pair) == 2:
            if not res:
                # Nothing to repeat; report it clearly instead of an IndexError
                raise ValueError(f'Run length without a preceding pitch value in {x!r}')
            res.extend([res[-1]] * int(pair[1]))

    res = np.array(res, dtype=np.int32)
    if res.size == 0:
        # Empty pitch string: no offsets at all
        return np.zeros(0)
    if np.all(res == res[0]):
        return np.zeros(res.shape)
    return np.concatenate([res, np.zeros(1)])
205
+
206
+ # Pitch conversion
207
def note_to_midi(x):
    """Convert a note name such as 'C4' or 'A#3' to a MIDI note number.

    Parameters
    ----------
    x : str
        Note name: a letter A-G, an optional '#', and a (possibly negative)
        octave number.

    Returns
    -------
    int
        The MIDI note number, computed as (octave + 1) * 12 + note index.

    Raises
    ------
    ValueError
        If `x` does not start with a valid note name.
    """
    match = note_re.match(x)
    if match is None:
        # Without this check the failure is an AttributeError on None
        raise ValueError(f'Invalid note name: {x!r}')
    note, octave = match.group(1, 2)
    return (int(octave) + 1) * 12 + notes[note]
212
+
213
def midi_to_hz(x):
    """Convert MIDI note numbers to Hertz in equal temperament (note 69 = 440 Hz).

    Accepts scalars or numpy arrays.
    """
    semitones_from_a4 = x - 69
    return 440 * np.exp2(semitones_from_a4 / 12)
216
+
217
+ ##def hz_to_midi(x):
218
+ ## return 12 * np.log2(x / 440) + 69
219
+
220
+ # WAV read/write
221
def read_wav(loc):
    """Read an audio file supported by soundfile as mono audio at 44.1kHz.

    If `loc` does not exist, the same path is retried with the extension of
    every format soundfile reports as available, and the first existing file
    is used.

    Parameters
    ----------
    loc : str or path-like
        Input audio file.

    Returns
    -------
    ndarray
        Audio data, mixed down to mono and resampled to 44.1kHz if needed.

    Raises
    ------
    FileNotFoundError
        If neither `loc` nor any alternative extension exists.
    """
    if isinstance(loc, (str, os.PathLike)):  # make sure input is Path
        loc = Path(loc)

    if not loc.exists():  # check for alternative files
        for ext in sf.available_formats():
            candidate = loc.with_suffix('.' + ext.lower())
            if candidate.exists():
                loc = candidate
                break
        else:
            raise FileNotFoundError("No supported audio file was found.")

    x, fs = sf.read(loc)
    if x.ndim == 2:
        # Average all channels... Probably not too good for formats bigger than stereo
        x = np.mean(x, axis=1)

    if fs != default_fs:
        x = resampy.resample(x, fs, default_fs)

    return x
257
+
258
def save_wav(loc, x):
    """Write audio data to a 16-bit PCM file at the default sampling rate.

    Parameters
    ----------
    loc : str or file
        Output WAV file.

    x : ndarray
        Audio data to write.

    Returns
    -------
    None
    """
    sf.write(loc, x, default_fs, subtype='PCM_16')
274
+
275
+ # Processing WORLD things
276
@njit(float64(float64[:], optional(float64), optional(float64)))
def _jit_base_frq(f0, f0_min, f0_max):
    """Weighted average of the F0 values lying within [f0_min, f0_max].

    Each frame is weighted by 2 ** (-q * q), where q is the local F0 slope
    (one-sided difference at the ends, central difference elsewhere), so flat
    regions contribute more than moving ones. Returns 0 when no frame lies
    inside the range.
    """
    q = 0.0
    avg_frq = 0.0
    tally = 0.0
    N = len(f0)

    if f0_min is None:
        f0_min = f0_floor

    if f0_max is None:
        f0_max = f0_ceil

    for i in range(N):
        if f0[i] >= f0_min and f0[i] <= f0_max:
            if N == 1:
                # A single frame has no neighbour; reading f0[i+1] would be
                # out of bounds, so treat the slope as flat.
                q = 0.0
            elif i < 1:
                q = f0[i+1] - f0[i]
            elif i == N - 1:
                q = f0[i] - f0[i-1]
            else:
                q = (f0[i+1] - f0[i-1]) / 2
            weight = 2 ** (-q * q)
            avg_frq += f0[i] * weight
            tally += weight

    if tally > 0:
        avg_frq /= tally
    return avg_frq
304
+
305
def base_frq(f0, f0_min=None, f0_max=None):
    """Get average F0 with a stronger bias on flatter areas.

    Parameters
    ----------
    f0 : list or ndarray
        Array of F0 values.

    f0_min : float, optional
        Lower F0 limit. The module default is used when None.

    f0_max : float, optional
        Upper F0 limit. The module default is used when None.

    Returns
    -------
    float
        Average F0.
    """
    # The compiled helper is declared for float64 arrays and float limits, so
    # lists, integer arrays and integer limits are converted first.
    f0 = np.asarray(f0, dtype=np.float64)
    if f0_min is not None:
        f0_min = float(f0_min)
    if f0_max is not None:
        f0_max = float(f0_max)
    return _jit_base_frq(f0, f0_min, f0_max)
325
+
326
class Resampler:
    """
    A class for the UTAU resampling process.

    Constructing an instance parses the arguments and immediately runs the
    render.

    Attributes
    ----------
    in_file : pathlib.Path
        Path to input file.

    out_file : str or pathlib.Path
        Path to output file. The literal string 'nul' skips rendering.

    pitch : int
        The pitch of the note as a MIDI note number.

    velocity : float
        The consonant velocity of the note.

    flags : dict
        Flag name mapped to its integer value, or None when the flag was
        given without a number.

    offset : float
        The offset from the start for the render area of the sample, in milliseconds.

    length : int
        The length of the stretched area in milliseconds.

    consonant : float
        The unstretched area of the render, in milliseconds.

    cutoff : float
        The cutoff from the end or from the offset for the render area of the sample.

    volume : float
        The volume of the note in percentage.

    modulation : float
        The modulation of the note in percentage.

    tempo : float
        The tempo of the note (the leading '!' is stripped).

    pitchbend : ndarray
        The decoded UTAU pitchbend parameter, in cents.

    Methods
    -------
    render(self):
        The rendering workflow. Immediately starts when class is initialized.

    get_features(self):
        Gets the WORLD features either from a cached file or generating it if it doesn't exist.

    generate_features(self, features_path):
        Generates WORLD features and saves it for later.

    resample(self, features):
        Renders a WAV file using the passed WORLD features.
    """
    def __init__(self, in_file, out_file, pitch, velocity, flags='', offset=0, length=1000, consonant=0, cutoff=0, volume=100, modulation=0, tempo='!100', pitch_string='AA'):
        """Initializes the renderer and immediately starts it.

        All numeric parameters may be passed as strings (as they arrive from
        the command line); they are converted here.

        Parameters
        ---------
        in_file : str
            Path to input file.

        out_file : str
            Path to output file.

        pitch : str
            The pitch of the note, as a note name.

        velocity : str or float
            The consonant velocity of the note.

        flags : str
            The flags of the note. '/' characters are ignored.

        offset : str or float
            The offset from the start for the render area of the sample.

        length : str or int
            The length of the stretched area in milliseconds.

        consonant : str or float
            The unstretched area of the render.

        cutoff : str or float
            The cutoff from the end or from the offset for the render area of the sample.

        volume : str or float
            The volume of the note in percentage.

        modulation : str or float
            The modulation of the note in percentage.

        tempo : str
            The tempo of the note, prefixed with one character (e.g. '!100').

        pitch_string : str
            The UTAU pitchbend parameter.
        """
        self.in_file = Path(in_file)
        self.out_file = out_file
        self.pitch = note_to_midi(pitch)
        self.velocity = float(velocity)
        self.flags = {k : int(v) if v else None for k, v in flag_re.findall(flags.replace('/', ''))}
        self.offset = float(offset)
        self.length = int(length)
        self.consonant = float(consonant)
        self.cutoff = float(cutoff)
        self.volume = float(volume)
        self.modulation = float(modulation)
        self.tempo = float(tempo[1:])
        self.pitchbend = pitch_string_to_cents(pitch_string)

        self.render()

    def render(self):
        """The rendering workflow. Immediately starts when class is initialized.

        Parameters
        ----------
        None
        """
        features = self.get_features()
        self.resample(features)

    def get_features(self):
        """Gets the WORLD features either from a cached file or generating it if it doesn't exist.

        Parameters
        ----------
        None

        Returns
        -------
        features : dict
            A dictionary of the F0, MGC, BAP, and average F0.
        """
        # Setup cache path file
        features_path = self.in_file.with_suffix(cache_ext)

        if 'G' in self.flags:
            logging.info('G flag exists. Forcing feature generation.')
            return self.generate_features(features_path)

        if os.path.exists(features_path):
            # Load if it exists. The arrays are copied into a plain dict so
            # the archive file handle is closed right away.
            logging.info(f'Reading {features_path.name}.')
            with np.load(features_path) as cached:
                return {key: cached[key] for key in cached.files}

        # Generate if not
        logging.info(f'{features_path.name} not found. Generating features.')
        return self.generate_features(features_path)

    def generate_features(self, features_path):
        """Generates WORLD features and saves it for later.

        Parameters
        ----------
        features_path : str or file
            The path for caching the features.

        Returns
        -------
        features : dict
            A dictionary of the F0, MGC, BAP, and average F0.
        """
        x = read_wav(self.in_file)

        # Check if audio is long enough
        min_samples = int(default_fs * 0.1)  # at least 100 ms
        if len(x) < min_samples:
            logging.warning(f'Audio too short ({len(x)} samples < {min_samples}). Padding with zeros.')
            # Pad with zeros to guarantee the minimum length
            x = np.pad(x, (0, min_samples - len(x)), mode='constant', constant_values=0)

        logging.info('Generating F0.')
        f0, t = world.harvest(x, default_fs, f0_floor=f0_floor, f0_ceil=f0_ceil)
        base_f0 = base_frq(f0)

        logging.info('Generating spectral envelope.')
        sp = world.cheaptrick(x, f0, t, default_fs)
        mgc = world.code_spectral_envelope(sp, default_fs, 64)

        logging.info('Generating aperiodicity.')
        ap = world.d4c(x, f0, t, default_fs, threshold=0.25)
        bap = world.code_aperiodicity(ap, default_fs)

        logging.info('Saving features.')

        features = {'base' : base_f0, 'f0' : f0, 'mgc' : mgc, 'bap' : bap}
        np.savez_compressed(features_path, **features)

        return features

    def resample(self, features):
        """Renders a WAV file using the passed WORLD features.

        Parameters
        ----------
        features : dict
            A dictionary of the F0, MGC, BAP, and average F0.

        Returns
        -------
        None
        """
        if self.out_file == 'nul':
            logging.info('Null output file. Skipping...')
            return

        self.out_file = Path(self.out_file)
        opts = self.flags

        # Convert percentages to decimal
        vel = np.exp2(1 - self.velocity / 100) # convel is more a multiplier...
        vol = self.volume / 100
        mod = self.modulation / 100

        logging.info('Decoding WORLD features.')
        # Recalculate spectral envelope and aperiodicity
        sp = world.decode_spectral_envelope(features['mgc'], default_fs, fft_size)
        ap = world.decode_aperiodicity(features['bap'], default_fs, fft_size)

        # Turn F0 to offset map for modulation
        base_f0 = features['base']
        f0 = features['f0']
        f0 = np.where(f0 == 0, base_f0, f0)  # fill unvoiced frames without mutating the input
        f0_off = f0 - base_f0

        # Calculate temporal positions
        t_area = np.arange(len(f0)) * 0.005

        logging.info('Calculating timing.') # use seconds instead of 5ms terms cuz someone gave me negative offsets </3
        start = self.offset / 1000 # start time
        end = self.cutoff / 1000 # end time
        if self.cutoff < 0: # deal with relative end time
            end = start - end
        else:
            end = t_area[-1] - end
        con = start + self.consonant / 1000 # consonant

        logging.info('Preparing interpolators.')
        # Check if we have enough data points for interpolation
        if len(t_area) < 2 or len(f0_off) < 2:
            logging.error(f'Insufficient data for interpolation: t_area={len(t_area)}, f0_off={len(f0_off)}')
            # Create a minimal valid signal
            if len(t_area) < 2:
                t_area = np.array([0.0, 0.01]) # 10ms minimum
            if len(f0_off) < 2:
                f0_off = np.array([0.0, 0.0])
            if len(sp) < 2:
                sp = np.repeat(sp[:1], 2, axis=0) if len(sp) == 1 else np.zeros((2, fft_size//2+1))
            if len(ap) < 2:
                ap = np.repeat(ap[:1], 2, axis=0) if len(ap) == 1 else np.zeros((2, fft_size//2+1))

        # Make interpolators to render new areas.
        # A spline of degree k needs more than k points, so lower the degree
        # for very short inputs (including the two-point fallback above).
        spline_degree = min(3, len(t_area) - 1)
        f0_off_interp = interp.UnivariateSpline(t_area, f0_off, k=spline_degree, s=0, ext='const')
        sp_interp = interp.Akima1DInterpolator(t_area, sp)
        ap_interp = interp.Akima1DInterpolator(t_area, ap)

        # Make new temporal positions array for stretching
        t_consonant = np.linspace(start, con, num=int(vel * self.consonant / 5), endpoint=False) # temporal positions of the unstretched area. can be stretched because of velocity
        # stretched area only needs to stretch if the length required is longer than the stretch area
        length_req = self.length / 1000
        stretch_length = end - con
        if stretch_length > length_req:
            con_idx = int(200 * con) # position of consonant in the temporal positions array ??
            len_idx = int(200 * length_req) # length of length required by 5ms frames
            t_stretch = t_area[con_idx:con_idx+len_idx]
        else:
            t_stretch = np.linspace(con, end, num=int(200 * length_req))

        t_render = clip(np.concatenate([t_consonant, t_stretch]), 0, t_area[-1]) # concatenate and clip for interpolation
        con = len(t_consonant) # new placement of the consonant, now in 5ms frame terms...
        # Time of the consonant boundary. Equal to t[con] below, but also valid
        # when the stretched area is empty and con == len(t).
        t_con = con * 0.005

        logging.info('Interpolating WORLD features.')
        # Interpolate render area
        f0_off_render = f0_off_interp(t_render)
        sp_render = sp_interp(t_render)
        ap_render = clip(ap_interp(t_render), 0, 1) # aperiodicity freaks out if not within [0, 1] range

        # Calculate new temporal positions for tuning
        t = np.arange(len(sp_render)) * 0.005

        logging.info('Calculating pitch.')
        # Calculate pitch in MIDI note number terms
        pitch = self.pitchbend / 100 + self.pitch
        t_pitch = 60 * np.arange(len(pitch)) / (self.tempo * 96)

        # Check if we have enough pitch data points
        if len(pitch) < 2 or len(t_pitch) < 2:
            logging.warning(f'Insufficient pitch data: len(pitch)={len(pitch)}, len(t_pitch)={len(t_pitch)}')
            # Create minimal pitch data
            if len(pitch) < 2:
                pitch = np.array([self.pitch, self.pitch])
            if len(t_pitch) < 2:
                t_pitch = np.array([0.0, 0.01])

        pitch_interp = interp.Akima1DInterpolator(t_pitch, pitch)
        pitch_render = pitch_interp(clip(t, 0, t_pitch[-1]))

        logging.info('Checking flags.')
        # Flag interpretation area
        ### BEFORE HZ CONVERSION FLAGS ###
        # Pitch offset flag
        if 't' in opts:
            pitch_render += opts['t'] / 100

        # Convert pitch to Hertz and add F0 offset for modulation
        f0_render = midi_to_hz(pitch_render) + f0_off_render * mod

        ### BEFORE RENDER FLAGS ###
        # Vocal Fry flag
        if 'fe' in opts:
            logging.info('Adding vocal fry.')
            fry = opts['fe'] / 1000
            fry_len = 0.075
            fry_offset = 0
            fry_pitch = f0_floor
            if 'fl' in opts: # check length flag
                fry_len = max(opts['fl'] / 1000, 0.001)

            if 'fo' in opts:
                fry_offset = opts['fo'] / 1000

            if 'fp' in opts:
                fry_pitch = max(opts['fp'], 0)

            # Prepare envelope
            t_fry = t - t_con - fry_offset # temporal positions centered around the consonant shifted by offset
            amt = smoothstep(-fry - fry_len / 2, -fry + fry_len / 2, t_fry) * smoothstep(fry_len / 2, -fry_len / 2, t_fry) #fry envelope

            f0_render = f0_render * (1 - amt) + fry_pitch * amt # mix low F0 for fry

        # Gender/Formant shift flag
        if 'g' in opts:
            logging.info('Shifting formants.')
            gender = np.exp2(opts['g'] / 120)

            freq_x = np.linspace(0, 1, fft_size // 2 + 1) # map spectral envelope by frequency instead of time
            sp_render_interp = interp.Akima1DInterpolator(freq_x, sp_render, axis=1)

            # stretch spectral envelope depending on gender
            freq_x = clip(np.linspace(0, gender, fft_size // 2 + 1), 0, 1) # clip axis because Akima1DInterpolator doesn't extrapolate (or even just extend)
            sp_render = sp_render_interp(freq_x).copy(order='C')

        # map unvoicedness (kinda like voisona huskiness)
        husk = np.mean(ap_render, axis=1)

        # Breathiness flag
        if 'B' in opts:
            breath = opts['B']
            if breath <= 50: # Raise power to flatten smaller areas and keep max aperiodicity
                logging.info('Lowering breathiness.')
                breath = breath / 100
                ap_render = bias(ap_render, breath)
                ap_render[np.isclose(husk, 1),:] = 1 # make sure unvoiced areas stay unvoiced... only happens if breathiness is 0 but too much if statements
        else:
            breath = 0

        # Distortion flag
        if 'D' in opts:
            logging.info('Adding distortion.')
            distortion_amount = clip(opts['D'], 0, 100)
            ap_render = ap_render * (distortion_amount / 10)
            f0_render = f0_render + np.random.normal(0, distortion_amount, len(f0_render))

        # Coarseness flag
        if 'C' in opts:
            logging.info('Adding coarseness.')
            # Every sixth frame is forced to 60 Hz; the flag value itself is not used.
            f0_render[::6] = 60

        #Peak compressor flag
        flag_peak = opts.get('P', 86)
        # Keep the threshold strictly positive: at P100 and above the original
        # formula divides by zero. Values below 100 are unaffected.
        peak = max(1 - flag_peak / 100, 1e-6)

        if flag_peak > 0:
            rms = np.sqrt(2 * np.sum(sp_render, axis=1) / fft_size ** 2 + 0.000001) # get RMS.. i'm not sure if this is right but i think it's fine
            rms_peak = np.max(rms)
            rms_norm = rms / (peak * rms_peak)

            comp = np.zeros(rms_norm.shape)
            comp[rms_norm >= 1] = rms_norm[rms_norm >= 1] - 1
            comp = (1 - peak) * comp / np.max(comp)
            comp = 1 - comp

            comp = ndimage.gaussian_filter1d(comp, 6)

            # One gain per frame, broadcast over all frequency bins
            comp = comp[:, np.newaxis]
            sp_render *= comp
            ap_render *= comp

        # remove pitch in areas with max aperiodicity
        f0_render[np.isclose(husk, 1)] = 0
        render = world.synthesize(f0_render, sp_render, ap_render, default_fs)

        ### AFTER RENDER FLAGS ###
        # Max aperiodicity flag
        if 'S' in opts:
            amt = clip(opts['S'] / 100, 0, 1)
            render_ap = world.synthesize(f0_render, sp_render, np.ones(ap_render.shape), default_fs)
            render = render * (1 - amt) + render_ap * amt

        if breath > 50: # mix max breathiness signal
            logging.info('Raising breathiness.')
            breath = clip((breath - 50) / 50, 0, 1)
            render_breath = world.synthesize(f0_render, sp_render * np.square(ap_render), np.ones(ap_render.shape), default_fs) # apply band AP on regular specgram, max out ap

            render = render * (1 - breath) + render_breath * breath # Mix signals

        t_sample = np.arange(len(render)) / default_fs # temporal position per sample
        if 'fe' in opts:
            fry = opts['fe'] / 1000
            fry_len = 0.05
            fry_offset = 0
            fry_vol = 0.1
            if 'fl' in opts: # check length flag
                fry_len = max(opts['fl'] / 1000, 0.001)

            if 'fo' in opts:
                fry_offset = opts['fo'] / 1000

            if 'fv' in opts:
                fry_vol = clip(opts['fv'] / 100, 0, 1)

            # Prepare envelope
            t_fry = t_sample - t_con - fry_offset # temporal positions centered around the consonant shifted by offset
            amt = smoothstep(-fry - fry_len / 2, -fry + fry_len / 2, t_fry) * smoothstep(fry_len / 2, -fry_len / 2, t_fry) #fry envelope
            env = 1 - amt + fry_vol * amt

            render_hp = highpass(render, cutoff=300) # add a highpass through the fry area
            render = render * (1 - amt) + render_hp * amt
            render *= env

        # Fix voicing flag
        if 've' in opts:
            logging.info('Fixing voicing.')
            end_breath = opts['ve'] / 1000
            if end_breath == 0:
                # A zero-width transition would give smoothstep two equal
                # edges (division by zero); use a 1 ms transition instead.
                end_breath = 0.001
            render_breath = world.synthesize(f0_render, sp_render * np.square(ap_render), np.ones(ap_render.shape), default_fs) # apply band AP on regular specgram, max out ap

            offset = 0
            if 'vo' in opts: # check offset flag
                offset = opts['vo'] / 1000
                logging.info(offset)

            amt = smoothstep(-end_breath / 2, end_breath / 2, t_sample - t_con - offset) # smoothstep with consonant at 0.5
            render = render * (1 - amt) + render_breath * amt # mix sample based on envelope

        normalize = opts.get('p', 6)

        if normalize >= 0:
            # Normalize against the absolute peak so a dominant negative peak
            # is handled too, and leave an empty or silent render untouched.
            peak_amp = np.max(np.abs(render)) if render.size else 0
            if peak_amp > 0:
                normal = render / peak_amp
                render = normal * (10 ** (-normalize / 20))

        ### AFTER PEAK NORMALIZATION ###
        # Tremolo flag
        if 'A' in opts:
            logging.info('Adding tremolo.')
            tremolo = opts['A'] / 100

            pitch_sample = pitch_interp(clip(t_sample, 0, t_pitch[-1])) # probably bad because of how low the sampling rate is for the pitch
            pitch_smooth = lowpass(pitch_sample, cutoff=8, order=16)
            vibrato = highpass(pitch_smooth, cutoff=4, order=16)

            amt = np.maximum(tremolo * vibrato + 1, 0)
            render = render * amt

        # Growl flag
        if 'R' in opts:
            logging.info('Adding tremolo growl flag.')
            depth = clip(opts['R'] / 100, 0, 1)

            rate = 75

            # t_sample still matches the render length; no flag above resizes it
            sine_wave = np.sin(2 * np.pi * rate * t_sample)

            render = render * (2 - depth * sine_wave) / 2

        render *= vol # volume
        save_wav(self.out_file, render)
815
+
816
if __name__ == '__main__':
    # Command-line entry point: forward all arguments to the resampler.
    logging.info(f'straycat {version}')
    try:
        Resampler(*sys.argv[1:])
    except Exception as exc:
        # A TypeError is taken to mean a wrong argument count; show usage.
        # Anything else propagates unchanged.
        if exc.__class__.__name__ != 'TypeError':
            raise
        logging.info(help_string)
test_compressed_voicebank.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ 압축된 보이스뱅크 테스트 스크립트
4
+ 압축 전후의 성능과 정확성을 비교합니다.
5
+ """
6
+
7
+ import time
8
+ import numpy as np
9
+ from pathlib import Path
10
+ from utau_engine import UTAUEngine
11
+ from compressed_utau_engine import CompressedUTAUEngine
12
+ import logging
13
+
14
+ logging.basicConfig(level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
16
+
17
def test_voicebank_comparison():
    """Compare the original voicebank with its compressed version.

    Prints a report covering load time, compression info, phoneme coverage
    and synthesis time. Returns early (after printing a message) when the
    compressed voicebank file is missing or fails to load. The original
    voicebank is optional; comparisons against it are skipped when it is
    absent or fails to load.
    """
    # Paths
    original_path = "voice/hanseol CVC"
    compressed_path = "voice/hanseol_CVC_compressed.h5"

    print("🔍 보이스뱅크 비교 테스트 시작")
    print("=" * 50)

    # Nothing to compare without the compressed voicebank
    if not Path(compressed_path).exists():
        print("❌ 압축된 보이스뱅크를 찾을 수 없습니다.")
        print("먼저 voice_data_converter.py를 실행하세요.")
        return

    # 1. Load time comparison
    print("\n📊 1. 로딩 시간 비교")

    # Load the original voicebank (optional)
    original_engine = None
    original_load_time = float('inf')
    if Path(original_path).exists():
        start_time = time.perf_counter()
        try:
            original_engine = UTAUEngine(original_path)
            original_load_time = time.perf_counter() - start_time
            print(f" 원본 보이스뱅크 로딩: {original_load_time:.2f}초")
        except Exception as e:
            print(f" 원본 로딩 실패: {e}")
            original_engine = None
            original_load_time = float('inf')
    else:
        print(" 원본 보이스뱅크 없음")

    # Load the compressed voicebank. Only the constructor is inside the try,
    # so an arithmetic error in the report is not shown as a load failure.
    start_time = time.perf_counter()
    try:
        compressed_engine = CompressedUTAUEngine(compressed_path)
    except Exception as e:
        print(f" 압축된 버전 로딩 실패: {e}")
        return
    compressed_load_time = time.perf_counter() - start_time
    print(f" 압축된 보이스뱅크 로딩: {compressed_load_time:.2f}초")

    if original_load_time != float('inf') and compressed_load_time > 0:
        speedup = original_load_time / compressed_load_time
        print(f" 로딩 속도 개선: {speedup:.1f}배")

    # 2. Compression info
    print("\n📊 2. 압축 정보")
    compression_info = compressed_engine.get_compression_info()
    print(f" 원본 크기: {compression_info['original_size_mb']:.1f} MB")
    print(f" 압축 크기: {compression_info['compressed_size_mb']:.1f} MB")
    print(f" 압축율: {compression_info['compression_ratio']:.1f}%")

    # 3. Phoneme counts
    print("\n📊 3. 음소 정보")
    compressed_phonemes = compressed_engine.get_available_phonemes()
    print(f" 압축된 버전 음소 수: {len(compressed_phonemes)}개")

    if original_engine:
        original_phonemes = original_engine.get_available_phonemes()
        print(f" 원본 음소 수: {len(original_phonemes)}개")

        # Share of original phonemes also present in the compressed version;
        # skipped when the original reports no phonemes (avoids dividing by zero)
        original_set = set(original_phonemes)
        compressed_set = set(compressed_phonemes)
        if original_set:
            match_rate = len(original_set & compressed_set) / len(original_set) * 100
            print(f" 음소 일치도: {match_rate:.1f}%")

    # 4. Synthesis test
    print("\n📊 4. 합성 성능 테스트")

    # Note sequence used for the synthesis test
    test_notes = [
        {
            "pitch": 60, # C4
            "startSeconds": 0.0,
            "durationSeconds": 0.5,
            "endSeconds": 0.5,
            "velocity": 100
        },
        {
            "pitch": 64, # E4
            "startSeconds": 0.5,
            "durationSeconds": 0.5,
            "endSeconds": 1.0,
            "velocity": 100
        },
        {
            "pitch": 67, # G4
            "startSeconds": 1.0,
            "durationSeconds": 0.5,
            "endSeconds": 1.5,
            "velocity": 100
        }
    ]

    test_lyrics = ["도", "미", "솔"]

    # Synthesis with the compressed voicebank
    start_time = time.perf_counter()
    _, compressed_status = compressed_engine.synthesize_sequence(
        test_notes, test_lyrics, tempo=120, volume=100
    )
    compressed_synth_time = time.perf_counter() - start_time

    print(f" 압축된 버전 합성 시간: {compressed_synth_time:.2f}초")
    print(f" 압축된 버전 상태: {compressed_status}")

    # Synthesis with the original voicebank (if available)
    if original_engine:
        start_time = time.perf_counter()
        _, original_status = original_engine.synthesize_sequence(
            test_notes, test_lyrics, tempo=120, volume=100
        )
        original_synth_time = time.perf_counter() - start_time

        print(f" 원본 합성 시간: {original_synth_time:.2f}초")
        print(f" 원본 상태: {original_status}")

        # Guard the divisor (the compressed time), not the numerator
        if compressed_synth_time > 0:
            speedup = original_synth_time / compressed_synth_time
            print(f" 합성 속도 개선: {speedup:.1f}배")

    # 5. Recommendations
    print("\n💡 5. 권장사항")
    print(" ✅ HDF5 압축 방식의 장점:")
    print(" - 단일 파일로 관리 용이")
    print(" - 높은 압축율로 저장공간 절약")
    print(" - 빠른 랜덤 액세스")
    print(" - Hugging Face Spaces 최적화")
    print(" ✅ Gradio/HF Spaces 배포 시:")
    print(" - 압축된 .h5 파일만 업로드")
    print(" - 원본 WAV 파일들은 제외")
    print(" - 빠른 앱 시작 시간")
    print(" - 낮은 스토리지 비용")


if __name__ == "__main__":
    test_voicebank_comparison()
utau_engine.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import logging
4
+ import tempfile
5
+ import numpy as np
6
+ import soundfile as sf
7
+ from pathlib import Path
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+ from dataclasses import dataclass
10
+ from straycat import Resampler
11
+
12
+ # 로깅 설정
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
@dataclass
class OtoEntry:
    """One record of a UTAU ``oto.ini`` file."""
    filename: str        # WAV file name
    alias: str           # alias (pronunciation)
    offset: float        # offset (ms)
    consonant: float     # consonant length (ms)
    cutoff: float        # cutoff (ms)
    preutterance: float  # pre-utterance (ms)
    overlap: float       # overlap (ms)

    @property
    def is_breath(self) -> bool:
        """Whether the alias marks a breath/silence sample (starts with ``-`` or ``*``)."""
        return self.alias.startswith(('-', '*'))

    @property
    def clean_alias(self) -> str:
        """The alias with its leading ``-``/``*`` marker removed."""
        # Two-character markers come first so that "- ka" loses both the
        # marker and the space that follows it.
        for marker in ('- ', '* ', '-', '*'):
            if self.alias.startswith(marker):
                return self.alias[len(marker):]
        return self.alias
45
+
46
class VoicebankManager:
    """Loads and indexes a UTAU voicebank directory.

    On construction the WAV files of the voicebank are indexed by file name
    and the root ``oto.ini`` is parsed into ``OtoEntry`` records keyed by alias.

    Attributes:
        voicebank_path: Root directory of the voicebank.
        oto_entries: Parsed oto.ini records, keyed by alias.
        wav_files: WAV file paths, keyed by bare file name.
    """

    # Encodings tried, in order, when reading oto.ini. UTF-8 goes first because
    # it is self-validating: legacy-encoded text practically never decodes as
    # valid UTF-8, whereas UTF-8 bytes can be silently mis-decoded by a legacy
    # codec. "utf-8-sig" also swallows a leading byte-order mark.
    _OTO_ENCODINGS = ('utf-8-sig', 'shift_jis', 'cp932', 'euc-jp', 'cp949')

    def __init__(self, voicebank_path: Union[str, Path]):
        """Index and parse the voicebank found at ``voicebank_path``.

        Raises:
            FileNotFoundError: If the directory or its ``oto.ini`` is missing.
        """
        self.voicebank_path = Path(voicebank_path)
        self.oto_entries: Dict[str, "OtoEntry"] = {}
        self.wav_files: Dict[str, Path] = {}
        self.load_voicebank()

    def load_voicebank(self):
        """Index the WAV files, then parse ``oto.ini``.

        Raises:
            FileNotFoundError: If the voicebank directory or ``oto.ini`` does not exist.
        """
        if not self.voicebank_path.exists():
            raise FileNotFoundError(f"보이스뱅크 경로를 찾을 수 없습니다: {self.voicebank_path}")

        oto_file = self.voicebank_path / "oto.ini"
        if not oto_file.exists():
            raise FileNotFoundError(f"oto.ini 파일을 찾을 수 없습니다: {oto_file}")

        self._index_wav_files()
        self._parse_oto_ini(oto_file)

        logger.info(f"보이스뱅크 로드 완료: {len(self.oto_entries)}개 엔트리, {len(self.wav_files)}개 WAV 파일")

    def _index_wav_files(self):
        """Index WAV files in the voicebank root and its direct sub-folders.

        The extension is matched case-insensitively so that ``.WAV`` files
        are also found on case-sensitive file systems. Files in sub-folders
        override same-named files in the root.
        """
        subfolders = sorted(p for p in self.voicebank_path.iterdir() if p.is_dir())
        for folder in [self.voicebank_path, *subfolders]:
            for candidate in folder.iterdir():
                if candidate.is_file() and candidate.suffix.lower() == '.wav':
                    self.wav_files[candidate.name] = candidate

    def _parse_oto_ini(self, oto_file: Path):
        """Read ``oto_file`` with the first encoding that decodes it and parse every line.

        Lines that fail to parse are logged and skipped.

        Raises:
            ValueError: If none of the candidate encodings can decode the file.
        """
        try:
            content = None

            for encoding in self._OTO_ENCODINGS:
                try:
                    with open(oto_file, 'r', encoding=encoding) as f:
                        content = f.read()
                except UnicodeDecodeError:
                    continue
                logger.info(f"oto.ini를 {encoding} 인코딩으로 읽었습니다.")
                break

            if content is None:
                raise ValueError("oto.ini 파일을 읽을 수 없습니다. 인코딩 문제가 있을 수 있습니다.")

            # Split without stripping the whole text first, so that the line
            # numbers reported in warnings match the file.
            for line_num, line in enumerate(content.split('\n'), 1):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue

                try:
                    self._parse_oto_line(line)
                except Exception as e:
                    logger.warning(f"oto.ini {line_num}번째 줄 파싱 실패: {e}")
                    continue

        except Exception as e:
            logger.error(f"oto.ini 파싱 실패: {e}")
            raise

    def _parse_oto_line(self, line: str):
        """Parse one ``filename=alias,offset,consonant,cutoff,preutterance,overlap`` record.

        Malformed lines are logged and ignored. A later record with the same
        alias replaces an earlier one.
        """
        if '=' not in line:
            return

        filename, params = line.split('=', 1)
        parts = params.split(',')

        if len(parts) != 6:
            logger.warning(f"잘못된 oto.ini 형식: {line}")
            return

        try:
            # A blank numeric field is read as 0 instead of rejecting the line.
            offset, consonant, cutoff, preutterance, overlap = (
                float(field) if field.strip() else 0.0 for field in parts[1:]
            )
        except ValueError as e:
            logger.warning(f"oto.ini 파라미터 파싱 실패: {line} - {e}")
            return

        # UTAU convention: an empty alias means "use the WAV name without its
        # extension". Without this, every alias-less record would collide on
        # the key '' and overwrite the previous one.
        alias = parts[0] or Path(filename).stem

        self.oto_entries[alias] = OtoEntry(
            filename=filename,
            alias=alias,
            offset=offset,
            consonant=consonant,
            cutoff=cutoff,
            preutterance=preutterance,
            overlap=overlap
        )

    def get_sample_for_phoneme(self, phoneme: str) -> Optional["OtoEntry"]:
        """Find the oto entry for ``phoneme``.

        An exact alias match wins. Otherwise entries whose marker-stripped
        alias equals ``phoneme`` are considered, preferring non-breath entries.

        Returns:
            The matching entry, or ``None`` if there is none.
        """
        exact = self.oto_entries.get(phoneme)
        if exact is not None:
            return exact

        candidates = [entry for entry in self.oto_entries.values() if entry.clean_alias == phoneme]
        if not candidates:
            return None

        for entry in candidates:
            if not entry.is_breath:
                return entry
        return candidates[0]

    def get_wav_path(self, filename: str) -> Optional[Path]:
        """Return the indexed path of ``filename``, or ``None`` if it is unknown.

        Falls back to a case-insensitive name match, because oto.ini files
        written on case-insensitive file systems may not match the on-disk case.
        """
        path = self.wav_files.get(filename)
        if path is None:
            wanted = filename.casefold()
            path = next(
                (p for name, p in self.wav_files.items() if name.casefold() == wanted),
                None,
            )
        return path

    def list_available_phonemes(self) -> List[str]:
        """Return the distinct marker-stripped aliases, sorted for a stable order."""
        return sorted({entry.clean_alias for entry in self.oto_entries.values()})
182
+
183
class UTAUEngine:
    """UTAU-compatible singing synthesis engine.

    Maps each lyric to a voicebank alias, renders every note with the
    straycat ``Resampler`` and mixes the rendered notes into a single WAV file.
    """

    # Sample rate (Hz) of the mix buffer and of the written output file.
    # NOTE(review): rendered notes are mixed without resampling, so this
    # assumes straycat writes its output at the same rate — confirm.
    SAMPLE_RATE = 44100

    _NOTE_NAMES = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')

    # Hangul syllable -> romanized alias (hanseol CVC). A few syllables have
    # two plausible romanizations (solfege "fa"/"la" vs. syllable-table
    # "pa"/"ra", and "r" vs. "l" for the whole row); the preferred value is
    # listed here and the other one in _KOREAN_ALTERNATES.
    _KOREAN_MAP = {
        # plain vowels
        '아': 'a', '이': 'i', '우': 'u', '에': 'e', '오': 'o', '으': 'eu', '어': 'eo',
        # consonant + vowel
        '바': 'ba', '비': 'bi', '부': 'bu', '베': 'be', '보': 'bo', '브': 'beu', '버': 'beo',
        '다': 'da', '디': 'di', '두': 'du', '데': 'de', '도': 'do', '드': 'deu', '더': 'deo',
        '가': 'ga', '기': 'gi', '구': 'gu', '게': 'ge', '고': 'go', '그': 'geu', '거': 'geo',
        '하': 'ha', '히': 'hi', '후': 'hu', '헤': 'he', '호': 'ho', '흐': 'heu', '허': 'heo',
        '자': 'ja', '지': 'ji', '주': 'ju', '제': 'je', '조': 'jo', '즈': 'jeu', '저': 'jeo',
        '카': 'ka', '키': 'ki', '쿠': 'ku', '케': 'ke', '코': 'ko', '크': 'keu', '커': 'keo',
        '마': 'ma', '미': 'mi', '무': 'mu', '메': 'me', '모': 'mo', '므': 'meu', '머': 'meo',
        '나': 'na', '니': 'ni', '누': 'nu', '네': 'ne', '노': 'no', '느': 'neu', '너': 'neo',
        '파': 'fa', '피': 'pi', '푸': 'pu', '페': 'pe', '포': 'po', '프': 'peu', '퍼': 'peo',
        '라': 'la', '리': 'ri', '루': 'ru', '레': 're', '로': 'ro', '르': 'reu', '러': 'reo',
        '사': 'sa', '시': 'si', '수': 'su', '세': 'se', '소': 'so', '스': 'seu', '서': 'seo',
        '타': 'ta', '티': 'ti', '투': 'tu', '테': 'te', '토': 'to', '트': 'teu', '터': 'teo',
        # compound vowels
        '야': 'ya', '예': 'ye', '여': 'yeo', '요': 'yo', '유': 'yu', '의': 'eui',
        '와': 'wa', '웨': 'we', '위': 'wi', '워': 'weo',
        # solfege syllable not covered above
        '솔': 'so',
    }

    # Second-choice romanizations, used only when the voicebank has no sample
    # for the preferred alias but does have one for the alternate.
    _KOREAN_ALTERNATES = {
        '라': 'ra', '리': 'li', '루': 'lu', '레': 'le', '로': 'lo', '르': 'leu', '러': 'leo',
        '파': 'pa',
    }

    # Romaji -> hiragana (Japanese voicebanks).
    _ROMAJI_MAP = {
        'a': 'あ', 'i': 'い', 'u': 'う', 'e': 'え', 'o': 'お',
        'ka': 'か', 'ki': 'き', 'ku': 'く', 'ke': 'け', 'ko': 'こ',
        'sa': 'さ', 'shi': 'し', 'su': 'す', 'se': 'せ', 'so': 'そ',
        'ta': 'た', 'chi': 'ち', 'tsu': 'つ', 'te': 'て', 'to': 'と',
        'na': 'な', 'ni': 'に', 'nu': 'ぬ', 'ne': 'ね', 'no': 'の',
        'ha': 'は', 'hi': 'ひ', 'fu': 'ふ', 'he': 'へ', 'ho': 'ほ',
        'ma': 'ま', 'mi': 'み', 'mu': 'む', 'me': 'め', 'mo': 'も',
        'ya': 'や', 'yu': 'ゆ', 'yo': 'よ',
        'ra': 'ら', 'ri': 'り', 'ru': 'る', 're': 'れ', 'ro': 'ろ',
        'wa': 'わ', 'wo': 'を', 'n': 'ん'
    }

    def __init__(self, voicebank_path: Union[str, Path]):
        """Load the voicebank at ``voicebank_path``."""
        self.voicebank = VoicebankManager(voicebank_path)
        self.default_phoneme = "あ"  # used when a lyric cannot be mapped

    def synthesize_sequence(self,
                            notes: List[Dict],
                            lyrics: List[str],
                            tempo: int = 120,
                            volume: int = 100) -> Tuple[Optional[str], str]:
        """Synthesize a note sequence with one lyric per note.

        Args:
            notes: Note dicts; the keys read are ``pitch``, ``startSeconds``,
                ``durationSeconds``, ``endSeconds`` and ``velocity``.
            lyrics: One lyric per note, in the same order.
            tempo: Tempo passed to the resampler.
            volume: Volume passed to the resampler.

        Returns:
            ``(wav_path, status_message)``. ``wav_path`` is a temporary WAV
            file the caller owns, or ``None`` when nothing could be synthesized.
        """
        if len(notes) != len(lyrics):
            return None, "노트와 가사의 개수가 일치하지 않습니다."

        if not notes:
            return None, "노트가 없습니다."

        try:
            sample_rate = self.SAMPLE_RATE

            # Size the mix buffer from the latest note end, plus one second of tail room.
            max_end_time_seconds = max(
                note.get('endSeconds', note.get('startSeconds', 0) + note.get('durationSeconds', 0.5))
                for note in notes
            )
            total_samples = max(int(max_end_time_seconds * sample_rate), 0) + sample_rate
            final_audio = np.zeros(total_samples)

            synthesized_count = 0

            for i, (note, lyric) in enumerate(zip(notes, lyrics)):
                try:
                    phoneme = self._lyric_to_phoneme(lyric)

                    oto_entry = self.voicebank.get_sample_for_phoneme(phoneme)
                    if not oto_entry:
                        logger.warning(f"음소 '{phoneme}' (가사: '{lyric}')에 해당하는 샘플을 찾을 수 없습니다.")
                        continue

                    wav_path = self.voicebank.get_wav_path(oto_entry.filename)
                    if not wav_path or not wav_path.exists():
                        logger.warning(f"WAV 파일을 찾을 수 없습니다: {oto_entry.filename}")
                        continue

                    synth_audio = self._synthesize_note(note, oto_entry, wav_path, tempo, volume)
                    if synth_audio is None:
                        continue

                    # The mix buffer is mono; fold multi-channel output down.
                    if synth_audio.ndim > 1:
                        synth_audio = synth_audio.mean(axis=1)

                    # A negative start would turn into a from-the-end slice; clamp it.
                    start_sample = max(int(note.get('startSeconds', 0) * sample_rate), 0)
                    end_sample = start_sample + len(synth_audio)

                    if end_sample > len(final_audio):
                        # Grow the buffer to fit this note plus one second of slack.
                        grow_by = end_sample + sample_rate - len(final_audio)
                        final_audio = np.concatenate([final_audio, np.zeros(grow_by)])

                    final_audio[start_sample:end_sample] += synth_audio

                    synthesized_count += 1
                    logger.info(f"노트 {i+1} 합성 완료: {lyric} -> {phoneme}")

                except Exception as e:
                    logger.error(f"노트 {i+1} 합성 실패: {e}")
                    continue

            if synthesized_count == 0:
                return None, "합성된 노트가 없습니다."

            # Peak-normalize to 0.8 full scale.
            peak = np.max(np.abs(final_audio))
            if peak > 0:
                final_audio = final_audio / peak * 0.8

            # Reserve the file name, close the handle, then write: writing to a
            # path that is still open fails on some platforms.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as output_file:
                output_path = output_file.name
            try:
                sf.write(output_path, final_audio, sample_rate)
            except Exception:
                # Do not leave an orphaned temp file behind; cleanup is best-effort.
                try:
                    os.unlink(output_path)
                except OSError:
                    pass
                raise

            duration_sec = len(final_audio) / sample_rate
            return output_path, f"✅ UTAU 합성 완료: {synthesized_count}/{len(notes)}개 노트, {duration_sec:.1f}초"

        except Exception as e:
            error_msg = f"❌ UTAU 합성 중 오류: {str(e)}"
            logger.error(error_msg)
            return None, error_msg

    def _lyric_to_phoneme(self, lyric: str) -> str:
        """Map a lyric to a voicebank alias.

        Lookup order: an alias that already exists in the voicebank, the
        Hangul table, the romaji table, and finally ``default_phoneme``.
        """
        lyric = lyric.strip()

        if not lyric:
            return self.default_phoneme

        # A lyric that is already an alias of this voicebank is used as-is.
        # This has to come before the romaji table, otherwise typing "ka" for
        # a romanized voicebank would be turned into "か" and never be found.
        if self.voicebank.get_sample_for_phoneme(lyric) is not None:
            return lyric

        primary = self._KOREAN_MAP.get(lyric)
        if primary is not None:
            alternate = self._KOREAN_ALTERNATES.get(lyric)
            if (alternate is not None
                    and self.voicebank.get_sample_for_phoneme(primary) is None
                    and self.voicebank.get_sample_for_phoneme(alternate) is not None):
                return alternate
            return primary

        hiragana = self._ROMAJI_MAP.get(lyric.lower())
        if hiragana is not None:
            return hiragana

        logger.warning(f"알 수 없는 가사: '{lyric}', 기본 음소 '{self.default_phoneme}' 사용")
        return self.default_phoneme

    def _synthesize_note(self,
                         note: Dict,
                         oto_entry: "OtoEntry",
                         wav_path: Path,
                         tempo: int,
                         volume: int) -> Optional[np.ndarray]:
        """Render one note with straycat and return its samples.

        Returns:
            The rendered audio, or ``None`` when rendering failed or produced
            no samples. Failures are logged, never raised.
        """
        try:
            note_name = self._midi_to_note_name(note['pitch'])

            # Note length in milliseconds, never shorter than 200 ms.
            min_duration = 200
            duration = max(note.get('durationSeconds', 0.5) * 1000, min_duration)

            # Reserve a temp file name for the resampler output.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                temp_output = temp_file.name

            try:
                offset = max(oto_entry.offset, 0)  # no negative offsets
                consonant = max(oto_entry.consonant if oto_entry.consonant > 0 else 50, 10)
                # Passed through unchanged: in oto.ini a negative cutoff is an
                # end position measured from the offset, a positive one is
                # measured back from the end of the file. Clamping negatives
                # to 0 would discard the sample's end marker.
                cutoff = oto_entry.cutoff

                try:
                    info = sf.info(wav_path)
                    wav_duration_ms = (info.frames / info.samplerate) * 1000

                    # Pull an offset that lies (almost) past the end back into the file.
                    if offset >= wav_duration_ms - 100:
                        offset = max(0, wav_duration_ms - 200)
                        logger.warning(f"오프셋이 너무 큽니다. {offset}ms로 조정했습니다.")

                    # End of the usable region, honouring the cutoff convention.
                    if cutoff < 0:
                        segment_end_ms = min(offset - cutoff, wav_duration_ms)
                    else:
                        segment_end_ms = wav_duration_ms - cutoff

                    # The consonant may use at most half the note and must
                    # leave 50 ms of material after it.
                    max_consonant = min(duration / 2, segment_end_ms - offset - 50)
                    consonant = min(consonant, max_consonant)

                except Exception as e:
                    logger.warning(f"WAV 파일 정보 확인 실패: {e}")

                # Enforce minimum lengths after the adjustments above.
                if consonant < 10:
                    consonant = 10
                if duration < consonant + 50:
                    duration = consonant + 50

                logger.info(f"합성 파라미터: offset={offset:.1f}ms, consonant={consonant:.1f}ms, duration={duration:.1f}ms")

                # NOTE(review): the result is read from out_file right after
                # construction, so the constructor is presumed to render — confirm.
                Resampler(
                    in_file=str(wav_path),
                    out_file=temp_output,
                    pitch=note_name,
                    velocity=note.get('velocity', 100),
                    length=duration,
                    volume=volume,
                    flags='',
                    offset=offset,
                    consonant=consonant,
                    cutoff=cutoff,
                    modulation=0,
                    tempo=f'!{tempo}'
                )

                # The temp file always exists (it was created above), so an
                # empty file is what signals that nothing was rendered.
                if not os.path.exists(temp_output) or os.path.getsize(temp_output) == 0:
                    logger.warning("합성 결과 파일이 생성되지 않았습니다.")
                    return None

                synth_audio, _ = sf.read(temp_output)

                if len(synth_audio) == 0:
                    logger.warning("합성된 오디오가 비어있습니다.")
                    return None

                return synth_audio

            except Exception as e:
                logger.error(f"straycat 합성 실패: {e}")
                return None

            finally:
                # Best-effort cleanup of the temp file.
                if os.path.exists(temp_output):
                    try:
                        os.unlink(temp_output)
                    except OSError:
                        pass

        except Exception as e:
            logger.error(f"노트 합성 실패: {e}")
            return None

    def _midi_to_note_name(self, midi_note: int) -> str:
        """Convert a MIDI note number to a note name such as ``C4`` (MIDI 60).

        Float pitches (e.g. ``60.0`` from JSON) are rounded to the nearest integer.
        """
        midi_note = int(round(midi_note))
        octave = (midi_note // 12) - 1
        return f"{self._NOTE_NAMES[midi_note % 12]}{octave}"

    def get_available_phonemes(self) -> List[str]:
        """Return the phonemes available in the loaded voicebank."""
        return self.voicebank.list_available_phonemes()
448
+
449
# Smoke test
def test_utau_engine():
    """Load the hanseol CVC voicebank and print a short phoneme summary.

    Returns the engine on success, or ``None`` when anything fails.
    """
    voicebank_path = "voice/hanseol CVC"
    try:
        engine = UTAUEngine(voicebank_path)

        print("hanseol CVC 보이스뱅크 로드 완료!")
        print(f"사용 가능한 음소: {len(engine.get_available_phonemes())}개")
        print(f"첫 10개 음소: {engine.get_available_phonemes()[:10]}")
    except Exception as exc:
        print(f"UTAU 엔진 테스트 실패: {exc}")
        return None

    return engine


if __name__ == "__main__":
    test_utau_engine()
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
voice/hanseol_CVC_compressed.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1c9292e72186f3701d906307e393bb8ca6b0342cde6e883caae513daf2eff61
3
+ size 35548060
voice/test_voice.sc.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dd4f9cdc03422f3febdf9787cb6f41592b51b11ff9ced53e7baf67e844b5858
3
+ size 108682
voice/test_voice.wav ADDED
Binary file (88.2 kB). View file
 
voice_data_converter.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import h5py
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from pathlib import Path
5
+ import logging
6
+ import json
7
+ import gzip
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+ import shutil
10
+ from utau_engine import VoicebankManager, OtoEntry
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
class VoiceDataCompressor:
    """Packs a UTAU voicebank (oto.ini entries and WAV audio) into one HDF5 file.

    File layout:
        ``metadata/oto_data``        gzip-compressed JSON of all oto entries (uint8)
        ``metadata`` attributes      voicebank name, entry/file counts, size statistics
        ``audio_data/<name>/audio``  mono float32 samples of one WAV file
        ``audio_data/<name>`` attrs  sample_rate, duration, original_filename
    """

    def __init__(self, output_path: str = "voice_data.h5"):
        self.output_path = Path(output_path)
        self.compression = 'gzip'   # HDF5 dataset compression filter
        self.compression_opts = 6   # compression level (0-9)

    def convert_voicebank_to_hdf5(self, voicebank_path: Union[str, Path]) -> bool:
        """Convert the voicebank at ``voicebank_path`` into ``self.output_path``.

        WAV files that cannot be read or stored are logged and skipped.

        Returns:
            ``True`` on success, ``False`` if the conversion failed (the error is logged).
        """
        try:
            voicebank_path = Path(voicebank_path)
            logger.info(f"보이스뱅크 변환 시작: {voicebank_path}")

            # Load the voicebank with the regular manager.
            vb_manager = VoicebankManager(voicebank_path)

            total_original_size = 0

            with h5py.File(self.output_path, 'w') as h5file:
                meta_group = h5file.create_group('metadata')

                # oto.ini records, serialized as JSON and gzip-compressed.
                oto_data = {
                    alias: {
                        'filename': entry.filename,
                        'alias': entry.alias,
                        'offset': entry.offset,
                        'consonant': entry.consonant,
                        'cutoff': entry.cutoff,
                        'preutterance': entry.preutterance,
                        'overlap': entry.overlap
                    }
                    for alias, entry in vb_manager.oto_entries.items()
                }
                oto_json = json.dumps(oto_data, ensure_ascii=False, indent=2)
                oto_compressed = gzip.compress(oto_json.encode('utf-8'))
                meta_group.create_dataset('oto_data', data=np.frombuffer(oto_compressed, dtype=np.uint8))

                meta_group.attrs['voicebank_name'] = voicebank_path.name
                meta_group.attrs['total_entries'] = len(vb_manager.oto_entries)
                meta_group.attrs['total_wav_files'] = len(vb_manager.wav_files)

                audio_group = h5file.create_group('audio_data')

                for filename, wav_path in vb_manager.wav_files.items():
                    # The reader derives the group name the same way; keep in sync.
                    group_name = filename.replace('.wav', '')
                    try:
                        audio_data, sample_rate = sf.read(wav_path)

                        # Stereo -> mono.
                        if len(audio_data.shape) > 1:
                            audio_data = np.mean(audio_data, axis=1)

                        file_group = audio_group.create_group(group_name)
                        file_group.create_dataset(
                            'audio',
                            data=audio_data.astype(np.float32),
                            compression=self.compression,
                            compression_opts=self.compression_opts,
                            shuffle=True,     # improves compression
                            fletcher32=True   # adds a checksum
                        )

                        file_group.attrs['sample_rate'] = sample_rate
                        file_group.attrs['duration'] = len(audio_data) / sample_rate
                        file_group.attrs['original_filename'] = filename

                        # Count the original size only for files that were
                        # actually stored, so the ratio compares like with like.
                        total_original_size += wav_path.stat().st_size
                        logger.info(f"변환 완료: {filename} ({len(audio_data)} samples)")

                    except Exception as e:
                        logger.error(f"파일 처리 실패 {wav_path}: {e}")
                        continue

            # Measure only after the file is closed: while it is open, buffered
            # data has not necessarily reached the disk and the size is too small.
            compressed_size = self.output_path.stat().st_size
            if total_original_size > 0:
                compression_ratio = (1 - compressed_size / total_original_size) * 100
            else:
                # Nothing was converted; avoid dividing by zero.
                compression_ratio = 0.0

            # Re-open to record the statistics next to the other metadata.
            with h5py.File(self.output_path, 'r+') as h5file:
                meta_group = h5file['metadata']
                meta_group.attrs['original_size_bytes'] = total_original_size
                meta_group.attrs['compressed_size_bytes'] = compressed_size
                meta_group.attrs['compression_ratio_percent'] = compression_ratio

            logger.info("변환 완료!")
            logger.info(f"원본 크기: {total_original_size / (1024*1024):.1f} MB")
            logger.info(f"압축 크기: {compressed_size / (1024*1024):.1f} MB")
            logger.info(f"압축율: {compression_ratio:.1f}%")

            return True

        except Exception as e:
            logger.error(f"HDF5 변환 실패: {e}")
            return False
124
+
125
class CompressedVoicebankManager:
    """Read-only access to a voicebank stored as a compressed HDF5 file.

    oto entries are loaded once on construction; audio is read on demand and
    kept in a small least-recently-used cache.
    """

    def __init__(self, hdf5_path: Union[str, Path]):
        """Open ``hdf5_path`` and load its oto metadata.

        Raises:
            FileNotFoundError: If the HDF5 file does not exist.
        """
        self.hdf5_path = Path(hdf5_path)
        self.oto_entries: Dict[str, "OtoEntry"] = {}
        # Insertion order doubles as recency order: oldest entry first.
        self._audio_cache: Dict[str, Tuple[np.ndarray, int]] = {}
        self.cache_size_limit = 50  # maximum number of cached audio files

        if not self.hdf5_path.exists():
            raise FileNotFoundError(f"압축된 보이스뱅크를 찾을 수 없습니다: {hdf5_path}")

        self.load_metadata()

    def load_metadata(self):
        """Load the oto entries from the ``metadata`` group of the HDF5 file.

        Raises:
            Exception: Any error while reading or decoding is logged and re-raised.
        """
        try:
            with h5py.File(self.hdf5_path, 'r') as h5file:
                meta = h5file['metadata']

                # gzip-compressed JSON stored as a uint8 dataset.
                oto_compressed = meta['oto_data'][:]
                oto_json = gzip.decompress(oto_compressed.tobytes()).decode('utf-8')
                oto_data = json.loads(oto_json)

                for alias, data in oto_data.items():
                    self.oto_entries[alias] = OtoEntry(
                        filename=data['filename'],
                        alias=data['alias'],
                        offset=data['offset'],
                        consonant=data['consonant'],
                        cutoff=data['cutoff'],
                        preutterance=data['preutterance'],
                        overlap=data['overlap']
                    )

                logger.info(f"압축된 보이스뱅크 로드: {meta.attrs['voicebank_name']}")
                logger.info(f"총 {meta.attrs['total_entries']}개 엔트리")
                # The ratio is purely informational; a file written without it
                # must still load.
                if 'compression_ratio_percent' in meta.attrs:
                    logger.info(f"압축율: {meta.attrs['compression_ratio_percent']:.1f}%")

        except Exception as e:
            logger.error(f"메타데이터 로드 실패: {e}")
            raise

    def get_audio_data(self, filename: str) -> Optional[Tuple[np.ndarray, int]]:
        """Return ``(samples, sample_rate)`` for ``filename``, using the LRU cache.

        Returns:
            The audio tuple, or ``None`` if the file is not in the archive or
            reading failed (the error is logged).
        """
        base_filename = filename.replace('.wav', '')

        cached = self._audio_cache.pop(base_filename, None)
        if cached is not None:
            # Re-insert so the entry becomes the most recently used one;
            # without this, eviction would be FIFO rather than LRU.
            self._audio_cache[base_filename] = cached
            return cached

        try:
            with h5py.File(self.hdf5_path, 'r') as h5file:
                if base_filename not in h5file['audio_data']:
                    return None

                file_group = h5file['audio_data'][base_filename]
                audio_data = file_group['audio'][:]
                sample_rate = file_group.attrs['sample_rate']

            result = (audio_data, int(sample_rate))

            # Evict least-recently-used entries (front of the dict) to make room.
            while self._audio_cache and len(self._audio_cache) >= self.cache_size_limit:
                del self._audio_cache[next(iter(self._audio_cache))]

            # A non-positive limit disables caching altogether.
            if self.cache_size_limit > 0:
                self._audio_cache[base_filename] = result

            return result

        except Exception as e:
            logger.error(f"오디오 데이터 로드 실패 {filename}: {e}")
            return None

    def get_sample_for_phoneme(self, phoneme: str) -> Optional["OtoEntry"]:
        """Find the oto entry for ``phoneme``.

        An exact alias match wins. Otherwise entries whose marker-stripped
        alias equals ``phoneme`` are considered, preferring non-breath entries.
        """
        exact = self.oto_entries.get(phoneme)
        if exact is not None:
            return exact

        candidates = [entry for entry in self.oto_entries.values() if entry.clean_alias == phoneme]
        if not candidates:
            return None

        for entry in candidates:
            if not entry.is_breath:
                return entry
        return candidates[0]

    def list_available_phonemes(self) -> List[str]:
        """Return the distinct marker-stripped aliases, sorted for a stable order."""
        return sorted({entry.clean_alias for entry in self.oto_entries.values()})

    def get_compression_info(self) -> Dict[str, object]:
        """Return the size statistics stored in the file, or ``{}`` if they cannot be read."""
        try:
            with h5py.File(self.hdf5_path, 'r') as h5file:
                meta = h5file['metadata']
                return {
                    'voicebank_name': meta.attrs['voicebank_name'],
                    'total_entries': meta.attrs['total_entries'],
                    'original_size_mb': meta.attrs['original_size_bytes'] / (1024*1024),
                    'compressed_size_mb': meta.attrs['compressed_size_bytes'] / (1024*1024),
                    'compression_ratio': meta.attrs['compression_ratio_percent'],
                    'file_path': str(self.hdf5_path)
                }
        except Exception as e:
            logger.error(f"압축 정보 로드 실패: {e}")
            return {}
243
+
244
def convert_voicebank_to_compressed_format(voicebank_path: str, output_path: Optional[str] = None) -> bool:
    """Convenience wrapper: convert a voicebank directory into a compressed HDF5 file.

    Args:
        voicebank_path: Directory of the source voicebank.
        output_path: Destination file. Defaults to
            ``voice/<voicebank name with spaces replaced by underscores>_compressed.h5``.

    Returns:
        ``True`` if the conversion succeeded, ``False`` otherwise.
    """
    if output_path is None:
        voicebank_name = Path(voicebank_path).name.replace(' ', '_')
        output_path = f"voice/{voicebank_name}_compressed.h5"

    # The HDF5 file cannot be created inside a directory that does not exist.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    converter = VoiceDataCompressor(output_path)
    return converter.convert_voicebank_to_hdf5(voicebank_path)
252
+
253
if __name__ == "__main__":
    # Manual check: compress the bundled voicebank, then reopen the result.
    if not convert_voicebank_to_compressed_format("voice/hanseol CVC"):
        print("❌ 보이스뱅크 압축 실패!")
    else:
        print("✅ 보이스뱅크 압축 변환 완료!")

        # Read the compressed file back and report what it contains.
        compressed_vb = CompressedVoicebankManager("voice/hanseol_CVC_compressed.h5")
        info = compressed_vb.get_compression_info()
        print(f"📊 압축 정보: {info}")
        phoneme_count = len(compressed_vb.list_available_phonemes())
        print(f"🎤 사용 가능한 음소: {phoneme_count}개")