RVC-Models

Build error

App Files Files Community

nevreal committed on Sep 19, 2024

Commit

780f01d

verified ·

1 Parent(s): aaced9c

Create rvc.py

Browse files

Files changed (1) hide show

rvc.py +243 -0

rvc.py ADDED Viewed

	@@ -0,0 +1,243 @@

+import os
+import glob
+import json
+import traceback
+import logging
+import gradio as gr
+import numpy as np
+import librosa
+import torch
+import asyncio
+import edge_tts
+import yt_dlp
+import ffmpeg
+import subprocess
+import sys
+import io
+import wave
+from datetime import datetime
+from fairseq import checkpoint_utils
+from lib.infer_pack.models import (
+    SynthesizerTrnMs256NSFsid,
+    SynthesizerTrnMs256NSFsid_nono,
+    SynthesizerTrnMs768NSFsid,
+    SynthesizerTrnMs768NSFsid_nono,
+)
+from vc_infer_pipeline import VC
+from config import Config
+config = Config()
+logging.getLogger("numba").setLevel(logging.WARNING)
+limitation = os.getenv("SYSTEM") == "spaces"
+audio_mode = []
+f0method_mode = []
+f0method_info = ""
+if limitation is True:
+    audio_mode = ["Upload audio", "TTS Audio"]
+    f0method_mode = ["pm", "crepe", "harvest"]
+    f0method_info = "PM is fast, rmvpe is middle, Crepe or harvest is good but it was extremely slow (Default: PM)"
+else:
+    audio_mode = ["Upload audio", "Youtube", "TTS Audio"]
+    f0method_mode = ["pm", "crepe", "harvest"]
+    f0method_info = "PM is fast, rmvpe is middle. Crepe or harvest is good but it was extremely slow (Default: PM))"
+if os.path.isfile("rmvpe.pt"):
+    f0method_mode.insert(2, "rmvpe")
+def create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, file_index):
+    def vc_fn(
+        vc_audio_mode,
+        vc_input,
+        vc_upload,
+        tts_text,
+        tts_voice,
+        f0_up_key,
+        f0_method,
+        index_rate,
+        filter_radius,
+        resample_sr,
+        rms_mix_rate,
+        protect,
+    ):
+        try:
+            if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
+                audio, sr = librosa.load(vc_input, sr=16000, mono=True)
+            elif vc_audio_mode == "Upload audio":
+                if vc_upload is None:
+                    return "You need to upload an audio", None
+                sampling_rate, audio = vc_upload
+                duration = audio.shape[0] / sampling_rate
+                if duration > 360 and limitation:
+                    return "Please upload an audio file that is less than 1 minute.", None
+                audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+                if len(audio.shape) > 1:
+                    audio = librosa.to_mono(audio.transpose(1, 0))
+                if sampling_rate != 16000:
+                    audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+            elif vc_audio_mode == "TTS Audio":
+                if len(tts_text) > 600 and limitation:
+                    return "Text is too long", None
+                if tts_text is None or tts_voice is None:
+                    return "You need to enter text and select a voice", None
+                asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
+                audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
+                vc_input = "tts.mp3"
+            times = [0, 0, 0]
+            f0_up_key = int(f0_up_key)
+            audio_opt = vc.pipeline(
+                hubert_model,
+                net_g,
+                0,
+                audio,
+                vc_input,
+                times,
+                f0_up_key,
+                f0_method,
+                file_index,
+                # file_big_npy,
+                index_rate,
+                if_f0,
+                filter_radius,
+                tgt_sr,
+                resample_sr,
+                rms_mix_rate,
+                version,
+                protect,
+                f0_file=None,
+            )
+            info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
+            print(f"{model_title} | {info}")
+            return info, (tgt_sr, audio_opt)
+        except:
+            info = traceback.format_exc()
+            print(info)
+            return info, (None, None)
+    return vc_fn
+def load_model():
+    categories = []
+    with open("weights/folder_info.json", "r", encoding="utf-8") as f:
+        folder_info = json.load(f)
+    for category_name, category_info in folder_info.items():
+        if not category_info['enable']:
+            continue
+        category_title = category_info['title']
+        category_folder = category_info['folder_path']
+        models = []
+        with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
+            models_info = json.load(f)
+        for character_name, info in models_info.items():
+            if not info['enable']:
+                continue
+            model_title = info['title']
+            model_name = info['model_path']
+            model_author = info.get("author", None)
+            model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
+            model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
+            cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
+            tgt_sr = cpt["config"][-1]
+            cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+            if_f0 = cpt.get("f0", 1)
+            version = cpt.get("version", "v1")
+            if version == "v1":
+                if if_f0 == 1:
+                    net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+                else:
+                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+                model_version = "V1"
+            elif version == "v2":
+                if if_f0 == 1:
+                    net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+                else:
+                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+                model_version = "V2"
+            del net_g.enc_q
+            print(net_g.load_state_dict(cpt["weight"], strict=False))
+            net_g.eval().to(config.device)
+            if config.is_half:
+                net_g = net_g.half()
+            else:
+                net_g = net_g.float()
+            vc = VC(tgt_sr, config)
+            print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
+            models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, model_index)))
+        categories.append([category_title, category_folder, models])
+    return categories
+def cut_vocal_and_inst(url, audio_provider, split_model):
+    if url != "":
+        if not os.path.exists("dl_audio"):
+            os.mkdir("dl_audio")
+        if audio_provider == "Youtube":
+            ydl_opts = {
+            'format': 'bestaudio/best',
+            'postprocessors': [{
+                'key': 'FFmpegExtractAudio',
+                'preferredcodec': 'wav',
+            }],
+            "outtmpl": 'dl_audio/youtube_audio',
+            }
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                ydl.download([url])
+            audio_path = "dl_audio/youtube_audio.wav"
+        else:
+            # Spotify doesnt work.
+            # Need to find other solution soon.
+            '''
+            command = f"spotdl download {url} --output dl_audio/.wav"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            audio_path = "dl_audio/spotify_audio.wav"
+            '''
+        if split_model == "htdemucs":
+            command = f"demucs --two-stems=vocals {audio_path} -o output"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            return "output/htdemucs/youtube_audio/vocals.wav", "output/htdemucs/youtube_audio/no_vocals.wav", audio_path, "output/htdemucs/youtube_audio/vocals.wav"
+        else:
+            command = f"demucs --two-stems=vocals -n mdx_extra_q {audio_path} -o output"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            return "output/mdx_extra_q/youtube_audio/vocals.wav", "output/mdx_extra_q/youtube_audio/no_vocals.wav", audio_path, "output/mdx_extra_q/youtube_audio/vocals.wav"
+    else:
+        raise gr.Error("URL Required!")
+        return None, None, None, None
+def combine_vocal_and_inst(audio_data, audio_volume, split_model):
+    if not os.path.exists("output/result"):
+        os.mkdir("output/result")
+    vocal_path = "output/result/output.wav"
+    output_path = "output/result/combine.mp3"
+    if split_model == "htdemucs":
+        inst_path = "output/htdemucs/youtube_audio/no_vocals.wav"
+    else:
+        inst_path = "output/mdx_extra_q/youtube_audio/no_vocals.wav"
+    with wave.open(vocal_path, "w") as wave_file:
+        wave_file.setnchannels(1)
+        wave_file.setsampwidth(2)
+        wave_file.setframerate(audio_data[0])
+        wave_file.writeframes(audio_data[1].tobytes())
+    command =  f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [1:a]volume={audio_volume}dB[v];[0:a][v]amix=inputs=2:duration=longest -b:a 320k -c:a libmp3lame {output_path}'
+    result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+    print(result.stdout.decode())
+    return output_path
+def load_hubert():
+    global hubert_model
+    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+        ["hubert_base.pt"],
+        suffix="",
+    )
+    hubert_model = models[0]
+    hubert_model = hubert_model.to(config.device)
+    if config.is_half:
+        hubert_model = hubert_model.half()
+    else:
+        hubert_model = hubert_model.float()
+    hubert_model.eval()