ewebspace committed on
Commit 2494838 · verified · 1 Parent(s): 9135ba5

Upload 4 files

Files changed (4)
  1. README.md +22 -12
  2. app.py +240 -0
  3. requirements.txt +9 -0
  4. spaces.yml +5 -0
README.md CHANGED
@@ -1,13 +1,23 @@
- ---
- title: FinalVocal
- emoji: 💻
- colorFrom: yellow
- colorTo: indigo
- sdk: gradio
- sdk_version: 5.47.2
- app_file: app.py
- pinned: false
- short_description: 'ref mastered '
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Reference Voice Conversion (HF Space)
+
+ Convert any vocal to match a **reference voice** (tone color) with [OpenVoice v2](https://github.com/myshell-ai/OpenVoice) and optional vocal separation via [Demucs](https://github.com/facebookresearch/demucs). Built with Gradio for a fast, simple UI.
+
+ ## ✨ Features
+ - Upload a **Reference** (clean, 5–20 seconds) and a **Track** or **Acapella**
+ - Optional **Demucs** stem separation to extract vocals from a full mix
+ - Control **style strength**, **pitch**, and **formant tilt**
+ - **Remix** the converted vocal with the instrumental, with gain controls
+
+ ## 🚀 Deploy
+ 1. Create a new **Hugging Face Space** (Python + Gradio). Hardware: **GPU recommended**.
+ 2. Add these files (`app.py`, `requirements.txt`, `README.md`).
+ 3. (Optional) Add `spaces.yml`:
+    ```yaml
+    sdk: gradio
+    python_version: 3.10
+    resources:
+      accelerators: ["gpu"]
+    ```
+ 4. Commit & run. The first build downloads the models (several hundred MB).
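+
+ To speed up the first request, you can optionally pre-download the OpenVoice v2 snapshot during the build (a sketch of the same call `app.py` makes lazily at runtime):
+
+ ```python
+ # optional warm-up; mirrors the snapshot_download call in app.py
+ from huggingface_hub import snapshot_download
+
+ snapshot_download(repo_id="myshell-ai/OpenVoiceV2", local_dir="models/openvoice")
+ ```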
+
+ ## 🧪 Tips
+ - The reference should be clean and dry (no heavy FX); mono is fine.
+ - Better results
app.py ADDED
@@ -0,0 +1,240 @@
+ # app.py
+ import os
+ import tempfile
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ import gradio as gr
+ import numpy as np
+ import soundfile as sf
+ import librosa
+ import torch  # for CUDA availability check
+
+ from huggingface_hub import snapshot_download
+
+ # ------------------------------
+ # Model bootstrap
+ # ------------------------------
+ MODEL_DIR = os.path.join(os.getcwd(), "models")
+ OPENVOICE_REPO = "myshell-ai/OpenVoiceV2"
+
+ os.makedirs(MODEL_DIR, exist_ok=True)
+
+ # Lazy import to speed up Space boot
+ _openvoice_loaded = False
+ _tone_converter = None
+ _content_extractor = None
+
+ _demucs_model = None
+
+
+ def _ensure_openvoice():
+     global _openvoice_loaded, _tone_converter, _content_extractor
+     if _openvoice_loaded:
+         return
+     # Download model snapshots into ./models/openvoice
+     local_dir = snapshot_download(
+         repo_id=OPENVOICE_REPO,
+         local_dir=os.path.join(MODEL_DIR, "openvoice"),
+         local_dir_use_symlinks=False,
+     )
+
+     # OpenVoice v2 ships Python modules alongside the checkpoints; import after download
+     import sys
+     if local_dir not in sys.path:
+         sys.path.append(local_dir)
+
+     # Import OpenVoice components
+     try:
+         from openvoice import se_extractor  # noqa: F401
+         from openvoice.api import ToneColorConverter, ContentVec
+     except Exception:
+         # Fallback to module paths used in some snapshots
+         from tone_color_converter.api import ToneColorConverter
+         from contentvec.api import ContentVec
+         from se_extractor import se_extractor  # noqa: F401
+
+     # Init content extractor (HuBERT-like)
+     content_ckpt = os.path.join(local_dir, "checkpoints", "contentvec", "checkpoint.pth")
+     _content_extractor = ContentVec(content_ckpt)
+
+     # Init tone color converter (DEVICE env var overrides auto-detection)
+     tcc_ckpt = os.path.join(local_dir, "checkpoints", "tone_color_converter", "checkpoint.pth")
+     device = os.environ.get("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
+     _tone_converter = ToneColorConverter(tcc_ckpt, device=device)
+
+     _openvoice_loaded = True
+
+
+ def _ensure_demucs():
+     global _demucs_model
+     if _demucs_model is not None:
+         return
+     from demucs.apply import apply_model
+     from demucs.pretrained import get_model
+     from demucs.audio import AudioFile
+     _demucs_model = {
+         "apply_model": apply_model,
+         "get_model": get_model,
+         "AudioFile": AudioFile,
+     }
+
+
+ def separate_vocals(wav_path, stem="vocals"):
+     """Return paths to the separated vocals and instrumental using htdemucs."""
+     _ensure_demucs()
+     apply_model = _demucs_model["apply_model"]
+     get_model = _demucs_model["get_model"]
+     AudioFile = _demucs_model["AudioFile"]
+
+     model = get_model(name="htdemucs")
+     model.cpu()
+     model.eval()
+
+     # AudioFile.read returns a (channels, samples) tensor; apply_model expects a
+     # batch, normalized by the mono reference as in the demucs CLI
+     wav = AudioFile(wav_path).read(streams=0, samplerate=44100, channels=2)
+     ref = wav.mean(0)
+     wav = (wav - ref.mean()) / ref.std()
+     with torch.no_grad():
+         out = apply_model(model, wav[None], shifts=1, split=True, overlap=0.25)[0]
+     out = out * ref.std() + ref.mean()
+     sources = {name: out[idx].cpu().numpy() for idx, name in enumerate(model.sources)}
+
+     # Save stems
+     base = os.path.splitext(os.path.basename(wav_path))[0]
+     out_dir = tempfile.mkdtemp(prefix="stems_")
+     vocal_path = os.path.join(out_dir, f"{base}_vocals.wav")
+     inst_path = os.path.join(out_dir, f"{base}_inst.wav")
+
+     sf.write(vocal_path, sources["vocals"].T, 44100)
+     # Combine the remaining stems into the instrumental (straight sum, no averaging)
+     inst = sum(v for k, v in sources.items() if k != "vocals")
+     sf.write(inst_path, inst.T, 44100)
+     return vocal_path, inst_path
+
+
+ def load_audio(x, sr=44100, mono=True):
+     y, _sr = librosa.load(x, sr=sr, mono=mono)
+     return y, sr
+
+
+ def save_audio(y, sr):
+     # mkstemp instead of the deprecated (and race-prone) mktemp
+     fd, path = tempfile.mkstemp(suffix=".wav")
+     os.close(fd)
+     sf.write(path, y, sr)
+     return path
+
+
+ def match_length(a, b):
+     # Pad/trim a to match the length of b
+     if len(a) < len(b):
+         a = np.pad(a, (0, len(b) - len(a)))
+     else:
+         a = a[:len(b)]
+     return a
+
+
+ def convert_voice(reference_wav, source_vocal_wav, style_strength=0.8, pitch_shift=0.0, formant_shift=0.0):
+     _ensure_openvoice()
+
+     # Load audio at 16 kHz mono (OpenVoice's working rate)
+     ref, sr = load_audio(reference_wav, sr=16000, mono=True)
+     src, _ = load_audio(source_vocal_wav, sr=16000, mono=True)
+
+     # Extract content features from the source vocal
+     content = _content_extractor.extract(src, sr)
+
+     # Extract the speaker embedding / tone color from the reference.
+     # OpenVoice ships an SE (speaker encoder) util; we mimic it via the API if exposed.
+     try:
+         from openvoice import se_extractor
+         # upstream get_se takes the converter instance and returns (se, audio_name)
+         se, _ = se_extractor.get_se(reference_wav, _tone_converter, vad=True)
+     except Exception:
+         # Some snapshots provide a bare get_se helper instead
+         from se_extractor import get_se
+         se = get_se(reference_wav)
+
+     # Run tone color conversion
+     converted = _tone_converter.convert(content, se, style_strength=style_strength)
+
+     y = converted
+
+     # Optional pitch & formant adjustments (light touch)
+     if abs(pitch_shift) > 1e-3:
+         # librosa >= 0.10 requires keyword arguments here
+         y = librosa.effects.pitch_shift(y.astype(np.float32), sr=16000, n_steps=pitch_shift)
+     if abs(formant_shift) > 1e-3:
+         # crude formant-esque EQ tilt: high- or low-pass around 1.5 kHz
+         import scipy.signal as sps
+         wn = 1500 / (16000 / 2)  # normalized cutoff
+         btype = "high" if formant_shift > 0 else "low"
+         b, a = sps.iirfilter(2, Wn=wn, btype=btype, ftype="butter")
+         y = sps.filtfilt(b, a, y)
+
+     out_path = save_audio(y, 16000)
+     return out_path
+
+
+ def process(reference, track, acapella=None, separate=False, style_strength=0.8, pitch_shift=0.0, formant_shift=0.0, remix=False, vocal_gain_db=0.0, inst_gain_db=0.0):
+     if reference is None:
+         raise gr.Error("Upload a voice reference (reference_wav)")
+
+     # Prepare vocals & instrumental
+     vocals_path = None
+     instrumental_path = None
+
+     if acapella is not None:
+         vocals_path = acapella
+     elif separate and track is not None:
+         vocals_path, instrumental_path = separate_vocals(track)
+     elif track is not None:
+         vocals_path = track
+     else:
+         raise gr.Error("Upload either a full track or an acapella")
+
+     # Convert the vocal
+     converted_vocal = convert_voice(reference, vocals_path, style_strength, pitch_shift, formant_shift)
+
+     if not remix:
+         return converted_vocal, None
+
+     # Remix with the instrumental (if missing, fall back to silence)
+     if instrumental_path is None and track is not None and separate:
+         _, instrumental_path = separate_vocals(track)
+     if instrumental_path is None:
+         # create a silent instrumental length-matched to the converted vocal
+         y, sr = load_audio(converted_vocal)
+         inst = np.zeros_like(y)
+         instrumental_path = save_audio(inst, sr)
+
+     cv, sr = load_audio(converted_vocal)
+     inst, isr = load_audio(instrumental_path)
+     if isr != sr:
+         inst = librosa.resample(inst, orig_sr=isr, target_sr=sr)
+
+     cv = match_length(cv, inst)
+     # apply gains (dB -> linear amplitude: 10 ** (dB / 20))
+     cv = cv * (10 ** (vocal_gain_db / 20.0))
+     inst = inst * (10 ** (inst_gain_db / 20.0))
+
+     mix = cv + inst
+     mix_path = save_audio(mix, sr)
+     return converted_vocal, mix_path
+
+
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # 🎙️ Reference Voice Conversion
+     Upload a voice **reference** and a **track/acapella** to get a vocal converted to the reference's timbre. Optional: vocal separation (Demucs) and a remix into the instrumental.
+     """)
+     with gr.Row():
+         with gr.Column():
+             ref = gr.Audio(label="Reference Voice (clean, 5–20s)", type="filepath")
+             track = gr.Audio(label="Source Track (full mix)", type="filepath")
+             acap = gr.Audio(label="Source Acapella (optional)", type="filepath")
+             separate = gr.Checkbox(label="Separate vocals with Demucs", value=True)
+             remix = gr.Checkbox(label="Make a final mix (vocal + instrumental)", value=True)
+         with gr.Column():
+             style = gr.Slider(0.0, 1.0, value=0.85, step=0.01, label="Style strength (timbre)")
+             pitch = gr.Slider(-6, 6, value=0, step=0.5, label="Pitch shift (semitones)")
+             formant = gr.Slider(-1.0, 1.0, value=0.0, step=0.1, label="Formant tilt (experimental)")
+             vgain = gr.Slider(-12, 12, value=0, step=0.5, label="Vocal gain (dB)")
+             igain = gr.Slider(-12, 12, value=0, step=0.5, label="Instrumental gain (dB)")
+     btn = gr.Button("Convert")
+     with gr.Row():
+         out_vocal = gr.Audio(label="Converted Vocal", type="filepath")
+         out_mix = gr.Audio(label="Remix (Vocal + Instrumental)", type="filepath")
+
+     btn.click(
+         fn=process,
+         inputs=[ref, track, acap, separate, style, pitch, formant, remix, vgain, igain],
+         outputs=[out_vocal, out_mix]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio>=4.36.1
+ huggingface_hub>=0.23.0
+ soundfile>=0.12.1
+ librosa>=0.10.1
+ numpy>=1.26.4
+ scipy>=1.11.4
+ torch>=2.1.0
+ openvoice==0.2.0 ; python_version>="3.10"  # if available; otherwise models ship in repo
+ demucs>=4.0.1
spaces.yml ADDED
@@ -0,0 +1,5 @@
+ # spaces.yml (plain YAML; the stray markdown fence was removed)
+ sdk: gradio
+ python_version: 3.10
+ resources:
+   accelerators: ["gpu"]