import os
import sys
from music.search import get_youtube, download_random
from utils.utils import log_execution_time
from vits.models import SynthesizerInfer
import whisper.inference
from omegaconf import OmegaConf
import torchcrepe
import torch
import gradio as gr
import librosa
import numpy as np
import soundfile
from pydub import AudioSegment
import uuid
from torchspleeter.utils import sound_split
from torchspleeter.splitter import Splitter
import logging

logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
def load_svc_model(checkpoint_path, model):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
    saved_state_dict = checkpoint_dict["model_g"]
    state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        # Fall back to the freshly initialized weight when the checkpoint
        # lacks a key, instead of crashing with a KeyError.
        new_state_dict[k] = saved_state_dict.get(k, v)
    model.load_state_dict(new_state_dict)
    return model
def compute_f0_nn(filename, device):
    # Load audio at 16 kHz mono
    audio, sr = librosa.load(filename, sr=16000)
    assert sr == 16000
    audio = torch.tensor(np.copy(audio))[None]
    # Here we'll use a 20 millisecond hop length
    hop_length = 320
    # Provide a sensible frequency range for your domain (upper limit is 2006 Hz)
    # This would be a reasonable range for speech
    fmin = 50
    fmax = 1000
    # Select a model capacity--one of "tiny" or "full"
    model = "tiny"
    # Pick a batch size that doesn't cause memory errors on your gpu
    batch_size = 512
    # Compute pitch (and periodicity) on the selected device
    pitch, periodicity = torchcrepe.predict(
        audio,
        sr,
        hop_length,
        fmin,
        fmax,
        model,
        batch_size=batch_size,
        device=device,
        return_periodicity=True,
    )
    # Duplicate each frame so the 320-sample hop matches a 160-sample hop (320 -> 160 * 2).
    # repeat_interleave keeps the tensors on `device`; np.repeat would fail on CUDA tensors.
    pitch = torch.repeat_interleave(pitch, 2, dim=-1)
    periodicity = torch.repeat_interleave(periodicity, 2, dim=-1)
    # CREPE was not trained on silent audio, so filter out spurious values on silence.
    periodicity = torchcrepe.filter.median(periodicity, 9)
    pitch = torchcrepe.filter.mean(pitch, 9)
    pitch[periodicity < 0.1] = 0
    pitch = pitch.squeeze(0)
    return pitch
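# Example (a sketch, not called by the app): sanity-check compute_f0_nn on any
# wav file. "sample.wav" is a hypothetical path, not part of this repo.
#
#   f0 = compute_f0_nn("sample.wav", torch.device("cpu"))
#   print(f0.shape)  # one pitch value per 10 ms frame; 0 marks unvoiced frames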
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hp = OmegaConf.load("configs/base.yaml")
model = SynthesizerInfer(
    hp.data.filter_length // 2 + 1,
    hp.data.segment_size // hp.data.hop_length,
    hp)
load_svc_model("vits_pretrain/sovits5.0-48k-debug.pth", model)
model.eval()
model.to(device)
model.enc_p = torch.quantization.quantize_dynamic(model.enc_p, {torch.nn.Linear}, dtype=torch.qint8)

whisper_model = whisper.inference.load_model(os.path.join("whisper_pretrain", "medium.pt"))
whisper_quant_model = torch.quantization.quantize_dynamic(
    whisper_model, {torch.nn.Linear}, dtype=torch.qint8
)

splitter_model = Splitter.from_pretrained(os.path.join("torchspleeter/models/2stems", "spleeter.pth")).to(device).eval()
splitter_quant_model = torch.quantization.quantize_dynamic(
    splitter_model, {torch.nn.Linear}, dtype=torch.qint8
)
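# Dynamic INT8 quantization (above) keeps activations in float and stores
# nn.Linear weights as int8, trading a little accuracy for CPU speed and memory.
# A minimal sketch of the same pattern on a toy module (names are illustrative):
#
#   toy = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU())
#   toy_q = torch.quantization.quantize_dynamic(toy, {torch.nn.Linear}, dtype=torch.qint8)
#   _ = toy_q(torch.randn(1, 64))  # forward pass dequantizes weights on the fly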
# warm up
# separator.separate_to_file('warm.wav', '/tmp/warm')
def svc_change(argswave, argsspk):
    argsppg = "svc_tmp_quant.ppg.npy"
    # whisper.inference.pred_ppg(whisper_model, argswave, argsppg)
    whisper.inference.pred_ppg(whisper_quant_model, argswave, argsppg)
    # os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")
    spk = np.load(argsspk)
    spk = torch.FloatTensor(spk)
    ppg = np.load(argsppg)
    ppg = np.repeat(ppg, 2, 0)  # 320 PPG -> 160 * 2
    ppg = torch.FloatTensor(ppg)
    pit = compute_f0_nn(argswave, device)
    # compute_f0_nn returns a tensor on `device`; move it to CPU float for slicing
    pit = pit.cpu().float()
    len_pit = pit.size()[0]
    len_ppg = ppg.size()[0]
    len_min = min(len_pit, len_ppg)
    pit = pit[:len_min]
    ppg = ppg[:len_min, :]
    with torch.no_grad():
        spk = spk.unsqueeze(0).to(device)
        source = pit.unsqueeze(0).to(device)
        source = model.pitch2source(source)
        hop_size = hp.data.hop_length
        all_frame = len_min
        # Synthesize in 25 s chunks with a 10-frame (hop_frame) overlap on each
        # side, then trim the overlap so chunk boundaries don't click.
        hop_frame = 10
        out_chunk = 2500  # 2500 frames * 10 ms = 25 s
        out_index = 0
        out_audio = []
        has_audio = False
        while out_index + out_chunk < all_frame:
            has_audio = True
            if out_index == 0:  # start frame
                cut_s = out_index
                cut_s_48k = 0
            else:
                cut_s = out_index - hop_frame
                cut_s_48k = hop_frame * hop_size
            if out_index + out_chunk + hop_frame > all_frame:  # end frame
                cut_e = out_index + out_chunk
                cut_e_48k = None  # keep the tail; a 0 here would make the slice empty
            else:
                cut_e = out_index + out_chunk + hop_frame
                cut_e_48k = -1 * hop_frame * hop_size
            sub_ppg = ppg[cut_s:cut_e, :].unsqueeze(0).to(device)
            sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device)
            sub_len = torch.LongTensor([cut_e - cut_s]).to(device)
            sub_har = source[:, :, cut_s * hop_size:cut_e * hop_size].to(device)
            sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
            sub_out = sub_out[0, 0].data.cpu().detach().numpy()
            sub_out = sub_out[cut_s_48k:cut_e_48k]
            out_audio.extend(sub_out)
            out_index = out_index + out_chunk
        if out_index < all_frame:
            if has_audio:
                cut_s = out_index - hop_frame
                cut_s_48k = hop_frame * hop_size
            else:
                cut_s = 0
                cut_s_48k = 0
            sub_ppg = ppg[cut_s:, :].unsqueeze(0).to(device)
            sub_pit = pit[cut_s:].unsqueeze(0).to(device)
            sub_len = torch.LongTensor([all_frame - cut_s]).to(device)
            sub_har = source[:, :, cut_s * hop_size:].to(device)
            sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
            sub_out = sub_out[0, 0].data.cpu().detach().numpy()
            sub_out = sub_out[cut_s_48k:]
            out_audio.extend(sub_out)
    out_audio = np.asarray(out_audio)
    return out_audio
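# Example (sketch): convert a prepared 16 kHz mono vocal track with singer 47.
# "temp_vocals.wav" is a hypothetical input; the speaker .npy ships with the repo.
#
#   wav48k = svc_change("temp_vocals.wav", "configs/singers/singer0047.npy")
#   soundfile.write("converted.wav", wav48k, 48000)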
def svc_main(name, sid, input_audio):
    if input_audio is None:
        return "You need to upload an audio file", None
    sampling_rate, audio = input_audio
    # Gradio delivers integer PCM; normalize it to float32 in [-1, 1].
    integer_dtypes = [np.int8, np.int16, np.int32, np.int64]
    if audio.dtype in integer_dtypes:
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    uuid_value = uuid.uuid4()
    uuid_string = str(uuid_value)
    input_audio_tmp_file = f'{uuid_string}.wav'
    tmpfile_path = f'/tmp/{uuid_string}'
    if not os.path.exists(tmpfile_path):
        os.makedirs(tmpfile_path)
    # prediction = separator.separate(audio)
    # vocals, accompaniment = prediction["vocals"], prediction["accompaniment"]
    soundfile.write(input_audio_tmp_file, audio, sampling_rate, format="wav")
    # Keep only the middle 30 seconds; pydub works in milliseconds.
    song = AudioSegment.from_file(input_audio_tmp_file, format="wav")
    length = len(song)
    left_idx = max(length // 2 - 15 * 1000, 0)
    right_idx = min(length // 2 + 15 * 1000, length)
    middle_30s = song[left_idx:right_idx]
    middle_30s.export(input_audio_tmp_file, format="wav")
    # Separate vocals from accompaniment.
    sound_split(splitter_quant_model, input_audio_tmp_file, tmpfile_path)
    curr_tmp_path = tmpfile_path
    vocals_filepath = os.path.join(curr_tmp_path, 'vocals.wav')
    accompaniment_filepath = os.path.join(curr_tmp_path, 'accompaniment.wav')
    vocals, sampling_rate = soundfile.read(vocals_filepath)
    if len(vocals.shape) > 1:
        vocals = librosa.to_mono(vocals.transpose(1, 0))
    if sampling_rate != 16000:
        vocals = librosa.resample(vocals, orig_sr=sampling_rate, target_sr=16000)
    if len(vocals) > 16000 * 100:
        vocals = vocals[:16000 * 100]
    wav_path = os.path.join(curr_tmp_path, "temp.wav")
    soundfile.write(wav_path, vocals, 16000, format="wav")
    out_vocals = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
    out_vocals_filepath = os.path.join(curr_tmp_path, 'out_vocals.wav')
    soundfile.write(out_vocals_filepath, out_vocals, 48000, format="wav")
    print(f"out_vocals_filepath: {out_vocals_filepath}")
    # Mix the converted vocals back over the original accompaniment.
    sound1 = AudioSegment.from_file(out_vocals_filepath)
    sound2 = AudioSegment.from_file(accompaniment_filepath)
    played_together = sound1.overlay(sound2)
    result_path = os.path.join(curr_tmp_path, 'out_song.wav')
    played_together.export(result_path, format="wav")
    print(f"result_path: {result_path}")
    result, sampling_rate = soundfile.read(result_path, dtype='int16')
    return "Success", (sampling_rate, result)
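# Example (sketch): drive svc_main headlessly, mimicking gradio's (rate, array)
# audio tuple. "song.wav" is a hypothetical local file.
#
#   data, sr = soundfile.read("song.wav", dtype='int16')
#   status, out = svc_main("song", "47", (sr, data))
#   print(status)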
def auto_search(name):
    save_music_path = '/tmp/downloaded'
    if not os.path.exists(save_music_path):
        os.makedirs(save_music_path)
    # musicdl-style config; 'savedir' as the download-directory key is an
    # assumption (the original used the path variable itself as the dict key).
    config = {'logfilepath': 'musicdl.log', 'savedir': save_music_path,
              'search_size_per_source': 5, 'proxies': {}}
    save_path = os.path.join(save_music_path, name + '.mp3')
    # youtube
    get_youtube(name, os.path.join(save_music_path, name))
    # task1 = threading.Thread(
    #     target=get_youtube,
    #     args=(name, os.path.join(save_music_path, name))
    # )
    # task1.start()
    # task2 = threading.Thread(
    #     target=download_random,
    #     args=(name, config, save_path)
    # )
    # task2.start()
    # task1.join(timeout=20)
    # task2.join(timeout=10)
    if not os.path.exists(save_path):
        return "Not Found", None
    signal, sampling_rate = soundfile.read(save_path, dtype='int16')
    # signal, sampling_rate = open_audio(save_path)
    return "Found the music", (sampling_rate, signal)
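# Example (sketch): search for a song by keyword and inspect the result.
#
#   status, found = auto_search("some song name")
#   if found is not None:
#       rate, signal = found
#       print(status, rate, signal.shape)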
def main():
    app = gr.Blocks()
    try:
        with app:
            title = "Singer Voice Clone 0.1 Demo"
            desc = """A small singer voice clone demo app. <br />
            Enter keywords to auto-search a song to clone, or upload music yourself. <br />
            This is a simplified demo; more advanced features can be used to optimize music quality. <br />"""
            tutorial_link = "https://docs.cworld.ai/docs/cworld-ai/quick-start-singer"
            gr.HTML(
                f"""
                <div style="text-align: center; margin: 0 auto;">
                  <a href="https://cworld.ai">
                    <svg style="margin: 0 auto;" width="155" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 407 100">
                      <g id="SvgjsG2746"
                         transform="matrix(0.8454106280193237,0,0,0.8454106280193237,-4.2270531400966185,-4.2270531400966185)"
                         fill="#111">
                        <g xmlns="http://www.w3.org/2000/svg">
                          <g>
                            <g>
                              <path d="M50,11c21.5,0,39,17.5,39,39S71.5,89,50,89S11,71.5,11,50S28.5,11,50,11 M50,5C25.1,5,5,25.1,5,50s20.1,45,45,45 s45-20.1,45-45S74.9,5,50,5L50,5z"></path>
                            </g>
                          </g>
                          <path d="M55,75H45v-5c0-2.8,2.2-5,5-5h0c2.8,0,5,2.2,5,5V75z"></path>
                          <rect x="25" y="35" width="10" height="20"></rect>
                          <rect x="65" y="35" width="10" height="20"></rect>
                        </g>
                      </g>
                      <g id="SvgjsG2747"
                         transform="matrix(3.3650250410766605,0,0,3.3650250410766605,93.98098208712985,-3.546415304677616)"
                         fill="#111">
                        <path
                          d="M8.1 17.42 l1.42 1.28 c-0.94 1.04 -2.28 1.5 -3.78 1.5 c-2.84 0 -5.14 -2.18 -5.14 -5.12 s2.3 -5.14 5.14 -5.14 c1.5 0 2.84 0.46 3.78 1.5 l-1.42 1.28 c-0.58 -0.78 -1.42 -1.08 -2.36 -1.08 c-1.7 0 -3.08 1.42 -3.08 3.44 c0 2 1.38 3.44 3.08 3.44 c0.94 0 1.78 -0.3 2.36 -1.1 z M23.42 10.12 l2.06 0 l-3.76 9.88 l-1.26 0 l-2.46 -6.4 l-2.44 6.4 l-1.26 0 l-3.78 -9.88 l2.08 0 l2.34 6.9 l2.06 -6.08 l0.26 -0.82 l1.48 0 l0.28 0.82 l2.06 6.08 z M31.62 11.64 c-1.7 0 -3.08 1.42 -3.08 3.44 c0 2 1.38 3.44 3.08 3.44 s3.08 -1.44 3.08 -3.44 c0 -2.02 -1.38 -3.44 -3.08 -3.44 z M31.62 9.94 c2.84 0 5.14 2.2 5.14 5.14 s-2.3 5.12 -5.14 5.12 s-5.14 -2.18 -5.14 -5.12 s2.3 -5.14 5.14 -5.14 z M44.9 10.24 l-0.44 1.62 c-0.14 -0.08 -0.58 -0.22 -0.94 -0.22 c-1.7 0 -2.5 1.62 -2.5 3.62 l0 4.74 l-2.06 0 l0 -9.88 l2.06 0 l0 1.4 c0.24 -0.92 1.3 -1.58 2.48 -1.58 c0.54 0 1.12 0.14 1.4 0.3 z M48.379999999999995 4.619999999999999 l0 15.38 l-2.08 0 l0 -15.38 l2.08 0 z M50.98 15.08 c0 -2.94 2.1 -5.14 4.94 -5.14 c0.98 0 2.18 0.42 2.84 0.96 l0 -5.9 l2.08 0 l0 15 l-2.08 0 l0 -0.74 c-0.78 0.58 -1.86 0.94 -2.84 0.94 c-2.84 0 -4.94 -2.18 -4.94 -5.12 z M53.06 15.08 c0 2 1.38 3.44 3.06 3.44 c1.12 0 2.12 -0.52 2.64 -1.58 c0.28 -0.54 0.44 -1.18 0.44 -1.86 s-0.16 -1.32 -0.44 -1.88 c-0.52 -1.06 -1.52 -1.56 -2.64 -1.56 c-1.68 0 -3.06 1.42 -3.06 3.44 z M66.46 18.78 c0 0.8 -0.62 1.42 -1.42 1.42 c-0.78 0 -1.4 -0.62 -1.4 -1.42 c0 -0.76 0.62 -1.38 1.4 -1.38 c0.8 0 1.42 0.62 1.42 1.38 z M73.08 9.92 c2.84 0 3.98 1.72 3.98 3.18 l0 6.9 l-2.06 0 l0 -1.08 c-0.72 0.98 -2 1.26 -2.8 1.26 c-2.26 0 -3.74 -1.32 -3.74 -3.08 c0 -2.46 1.84 -3.34 3.74 -3.34 l2.8 0 l0 -0.66 c0 -0.62 -0.24 -1.48 -1.92 -1.48 c-0.94 0 -1.8 0.5 -2.36 1.28 l-1.42 -1.28 c0.94 -1.04 2.28 -1.7 3.78 -1.7 z M75 16.92 l0 -1.48 l-2.52 0 c-1.22 0 -2.08 0.62 -1.94 1.74 c0.12 0.94 0.88 1.32 1.94 1.32 c1.9 0 2.52 -0.9 2.52 -1.58 z M81.9 10.12 l0 9.88 l-2.06 0 l0 -9.88 l2.06 0 z M82 6.5 c0 0.64 -0.5 1.14 -1.14 1.14 c-0.62 0 -1.12 -0.5 -1.12 -1.14 c0 -0.62 0.5 -1.12 1.12 -1.12 c0.64 0 1.14 0.5 1.14 1.12 z"></path>
                      </g>
                    </svg>
                  </a>
                  <div
                    style="
                      display: inline-flex;
                      align-items: center;
                      gap: 0.8rem;
                      font-size: 1.75rem;
                    "
                  >
                    <h1 style="font-weight: 900; margin-bottom: 7px;margin-top:5px">
                      {title}
                    </h1>
                  </div>
                  <p style="margin-bottom: 10px; font-size: 94%; line-height: 23px;">
                    {desc}
                    See the <a href="{tutorial_link}">tutorial</a> for details.
                  </p>
                </div>
                """
            )
            examples = [
                ['李荣浩不将就', '47', 'wavs/李荣浩不将就.mp3'],
                ['周杰伦告白气球', '47', 'wavs/周杰伦告白气球.mp3'],
                ['薛之谦天后', '51', 'wavs/薛之谦天后.mp3'],
            ]
            with gr.Group():
                with gr.Box():
                    with gr.Row():
                        with gr.Column():
                            sid = gr.Dropdown(label="Singer", choices=["22", "33", "47", "51"], value="47")
                            vc_input2 = gr.Textbox(label="Music Name")
                            vc_search = gr.Button("Auto Search", variant="primary")
                        with gr.Column():
                            vc_input3 = gr.Audio(label="Upload Music Yourself")
                            vc_submit = gr.Button("Convert", variant="primary")
                        with gr.Column():
                            vc_output1 = gr.Textbox(label="Run Status")
                            vc_output2 = gr.Audio(label="Result Audio")
            vc_search.click(auto_search, [vc_input2], [vc_output1, vc_input3])
            vc_submit.click(svc_main, [vc_input2, sid, vc_input3], [vc_output1, vc_output2])
            with gr.Row():
                gr.Examples(
                    examples=examples,
                    inputs=[vc_input2, sid, vc_input3],
                    outputs=[vc_output1, vc_output2],
                    fn=svc_main,
                    cache_examples=True,
                )
        app.launch()
    except KeyboardInterrupt:
        app.close()
        sys.exit(0)


if __name__ == '__main__':
    main()