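"""Reference voice conversion demo (Gradio app for a Hugging Face Space).

Pipeline: optionally separate vocals from a full mix with Demucs (htdemucs),
convert the vocal timbre toward a reference voice with OpenVoice V2, then
optionally remix the converted vocal over the instrumental.

Note: the OpenVoice import paths and checkpoint layout used below are a
best-effort guess and may need adjusting to the exact snapshot downloaded
from the Hub.
"""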

import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch

from huggingface_hub import snapshot_download

# ------------------------------
# Model bootstrap
# ------------------------------
MODEL_DIR = os.path.join(os.getcwd(), "models")
OPENVOICE_REPO = "myshell-ai/OpenVoiceV2"

os.makedirs(MODEL_DIR, exist_ok=True)

# Lazy import to speed up Space boot
_openvoice_loaded = False
_tone_converter = None
_content_extractor = None

_demucs_model = None

def _ensure_openvoice():
    global _openvoice_loaded, _tone_converter, _content_extractor
    if _openvoice_loaded:
        return
    # Download model snapshots into ./models/openvoice
    local_dir = snapshot_download(
        repo_id=OPENVOICE_REPO,
        local_dir=os.path.join(MODEL_DIR, "openvoice"),
        local_dir_use_symlinks=False,
    )

    # OpenVoice v2 snapshots ship python modules; import after download
    import sys
    if local_dir not in sys.path:
        sys.path.append(local_dir)

    # Import OpenVoice components
    try:
        from openvoice.api import ToneColorConverter, ContentVec
    except Exception:
        # Fallback to module paths used in some snapshots
        from tone_color_converter.api import ToneColorConverter
        from contentvec.api import ContentVec

    # Init content extractor (HuBERT-like)
    content_ckpt = os.path.join(local_dir, "checkpoints", "contentvec", "checkpoint.pth")
    _content_extractor = ContentVec(content_ckpt)

    # Init tone color converter
    device = os.environ.get("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
    tcc_ckpt = os.path.join(local_dir, "checkpoints", "tone_color_converter", "checkpoint.pth")
    _tone_converter = ToneColorConverter(tcc_ckpt, device=device)

    _openvoice_loaded = True


def _ensure_demucs():
   global _demucs_model
   if _demucs_model is not None:
       return
   from demucs.apply import apply_model
   from demucs.pretrained import get_model
   from demucs.audio import AudioFile
   _demucs_model = {
       "apply_model": apply_model,
       "get_model": get_model,
       "AudioFile": AudioFile,
   }


def separate_vocals(wav_path, stem="vocals"):
    """Return paths to separated vocals and accompaniment using htdemucs."""
    _ensure_demucs()
    apply_model = _demucs_model["apply_model"]
    get_model = _demucs_model["get_model"]
    AudioFile = _demucs_model["AudioFile"]

    model = get_model(name="htdemucs")
    model.cpu()
    model.eval()

    # AudioFile.read returns a (channels, time) tensor, not a context manager
    wav = AudioFile(wav_path).read(streams=0, samplerate=44100, channels=2)
    ref = wav.mean(0)
    wav = (wav - ref.mean()) / ref.std()  # normalize as demucs expects
    with torch.no_grad():
        out = apply_model(model, wav[None], shifts=1, split=True, overlap=0.25)[0]
    out = out * ref.std() + ref.mean()
    sources = {name: out[idx] for idx, name in enumerate(model.sources)}

    # Save stems
    base = os.path.splitext(os.path.basename(wav_path))[0]
    out_dir = tempfile.mkdtemp(prefix="stems_")
    vocal_path = os.path.join(out_dir, f"{base}_vocals.wav")
    inst_path = os.path.join(out_dir, f"{base}_inst.wav")

    sf.write(vocal_path, sources["vocals"].cpu().numpy().T, 44100)
    # Sum the non-vocal stems for the instrumental (averaging would attenuate it)
    inst = sum(v for k, v in sources.items() if k != "vocals")
    sf.write(inst_path, inst.cpu().numpy().T, 44100)
    return vocal_path, inst_path
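
# Example (hypothetical file name):
#   vocals_path, inst_path = separate_vocals("song.wav")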


def load_audio(x, sr=44100, mono=True):
   y, _sr = librosa.load(x, sr=sr, mono=mono)
   return y, sr


def save_audio(y, sr):
    # tempfile.mktemp is deprecated and racy; mkstemp creates the file safely
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    sf.write(path, y, sr)
    return path


def match_length(a, b):
   # Pad/trim a to match length of b
   if len(a) < len(b):
       a = np.pad(a, (0, len(b)-len(a)))
   else:
       a = a[:len(b)]
   return a
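
# e.g. match_length(np.ones(3), np.zeros(5)) -> [1, 1, 1, 0, 0]  (padded)
#      match_length(np.ones(5), np.zeros(3)) -> [1, 1, 1]        (trimmed)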


def convert_voice(reference_wav, source_vocal_wav, style_strength=0.8, pitch_shift=0.0, formant_shift=0.0):
    _ensure_openvoice()

    # Load audio at 16 kHz mono, the rate the OpenVoice models expect
    ref, sr = load_audio(reference_wav, sr=16000, mono=True)
    src, _ = load_audio(source_vocal_wav, sr=16000, mono=True)

    # Extract content features from source
    content = _content_extractor.extract(src, sr)

    # Extract speaker embedding / tone color from reference.
    # Upstream OpenVoice exposes se_extractor.get_se(audio_path, vc_model, ...)
    # and returns (embedding, audio_name); some snapshots differ.
    try:
        from openvoice import se_extractor
        se, _name = se_extractor.get_se(reference_wav, _tone_converter, vad=True)
    except Exception:
        # Some snapshots provide a standalone helper instead
        from se_extractor import get_se
        se = get_se(reference_wav)

    # Run tone color conversion
    y = _tone_converter.convert(content, se, style_strength=style_strength)

    # Optional pitch & formant adjustments (light touch)
    if abs(pitch_shift) > 1e-3:
        # librosa >= 0.10 requires keyword arguments here
        y = librosa.effects.pitch_shift(y.astype(np.float32), sr=16000, n_steps=pitch_shift)
    if abs(formant_shift) > 1e-3:
        # crude formant-esque EQ tilt: 2nd-order high/low-pass around 1.5 kHz
        import scipy.signal as sps
        wn = 2 * 1500 / 16000  # normalized cutoff (1.0 = Nyquist)
        btype = 'high' if formant_shift > 0 else 'low'
        b, a = sps.iirfilter(2, Wn=wn, btype=btype, ftype='butter')
        y = sps.filtfilt(b, a, y)

    out_path = save_audio(y, 16000)
    return out_path


def process(reference, track, acapella=None, separate=False, style_strength=0.8, pitch_shift=0.0, formant_shift=0.0, remix=False, vocal_gain_db=0.0, inst_gain_db=0.0):
    if reference is None:
        raise gr.Error("Upload a voice reference (reference_wav)")

    # Prepare vocals & instrumental: an explicit acapella wins, then Demucs
    # separation of the full track, then the raw track as-is
    vocals_path = None
    instrumental_path = None

    if acapella is not None:
        vocals_path = acapella
    elif separate and track is not None:
        vocals_path, instrumental_path = separate_vocals(track)
    elif track is not None:
        vocals_path = track
    else:
        raise gr.Error("Upload either a full track or an acapella")

    # Convert vocal
    converted_vocal = convert_voice(reference, vocals_path, style_strength, pitch_shift, formant_shift)

    if not remix:
        return converted_vocal, None

    # Remix back over the instrumental (if missing, use silence)
    if instrumental_path is None and track is not None and separate:
        _, instrumental_path = separate_vocals(track)
    if instrumental_path is None:
        # create a silent instrumental length-matched to the converted vocal
        y, sr = load_audio(converted_vocal)
        inst = np.zeros_like(y)
        instrumental_path = save_audio(inst, sr)

    cv, sr = load_audio(converted_vocal)
    inst, isr = load_audio(instrumental_path)
    if isr != sr:
        inst = librosa.resample(inst, orig_sr=isr, target_sr=sr)

    cv = match_length(cv, inst)
    # apply gains (dB -> linear amplitude: 10^(dB/20))
    cv = cv * (10 ** (vocal_gain_db / 20.0))
    inst = inst * (10 ** (inst_gain_db / 20.0))

    mix = cv + inst
    mix_path = save_audio(mix, sr)
    return converted_vocal, mix_path
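
# Note: the remix above is a plain sum of vocal and instrumental, so hot gain
# settings can clip; if the mix distorts, pull the gains down a few dB.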


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ Reference Voice Conversion
    Upload a **reference** voice and a **track/acapella** to get the vocal converted to the reference timbre. Optional: vocal separation (Demucs) and remixing over the instrumental.
    """)
    with gr.Row():
        with gr.Column():
            ref = gr.Audio(label="Reference Voice (clean, 5–20s)", type="filepath")
            track = gr.Audio(label="Source Track (full mix)", type="filepath")
            acap = gr.Audio(label="Source Acapella (optional)", type="filepath")
            separate = gr.Checkbox(label="Separate vocals with Demucs", value=True)
            remix = gr.Checkbox(label="Create final mix (vocal + instrumental)", value=True)
        with gr.Column():
            style = gr.Slider(0.0, 1.0, value=0.85, step=0.01, label="Style strength (timbre)")
            pitch = gr.Slider(-6, 6, value=0, step=0.5, label="Pitch shift (semitones)")
            formant = gr.Slider(-1.0, 1.0, value=0.0, step=0.1, label="Formant tilt (experimental)")
            vgain = gr.Slider(-12, 12, value=0, step=0.5, label="Vocal gain (dB)")
            igain = gr.Slider(-12, 12, value=0, step=0.5, label="Instrumental gain (dB)")
            btn = gr.Button("Convert")
    with gr.Row():
        out_vocal = gr.Audio(label="Converted Vocal", type="filepath")
        out_mix = gr.Audio(label="Remix (Vocal + Instrumental)", type="filepath")

    btn.click(
        fn=process,
        inputs=[ref, track, acap, separate, style, pitch, formant, remix, vgain, igain],
        outputs=[out_vocal, out_mix]
    )

if __name__ == "__main__":
   demo.launch()
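
# Tip: when running outside a Hugging Face Space, demo.launch(share=True)
# creates a temporary public link for quick sharing.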