|
|
import os |
|
|
import json |
|
|
import argparse |
|
|
import requests |
|
|
import traceback |
|
|
import logging |
|
|
import gradio as gr |
|
|
import numpy as np |
|
|
import librosa |
|
|
import torch |
|
|
import asyncio |
|
|
import soundfile as sf |
|
|
import edge_tts |
|
|
from datetime import datetime |
|
|
from fairseq import checkpoint_utils |
|
|
from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono |
|
|
from vc_infer_pipeline import VC |
|
|
from config import ( |
|
|
is_half, |
|
|
device |
|
|
) |
|
|
# Only let WARNING-and-above records from the "numba" logger through.
logging.getLogger("numba").setLevel(logging.WARNING)

# True only when the SYSTEM environment variable is exactly "spaces";
# create_vc_fn consults it for its text-length and audio-duration caps.
limitation = os.environ.get("SYSTEM") == "spaces"
|
|
|
|
|
|
|
|
def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy,
                 urlAudio,
                 f0_up_key,
                 f0_method,
                 index_rate,
                 tts_mode,
                 tts_text,
                 tts_voice):
    """Run one voice conversion and return ``(status, result)``.

    The input audio is either synthesized with edge-tts (``tts_mode`` truthy)
    or downloaded from ``urlAudio``. Either way it is decoded to 16 kHz mono
    and handed to ``vc.pipeline`` together with the module-level
    ``hubert_model``. The converted audio is also written to ``hasil.wav``.

    Args:
        tgt_sr: Sample rate paired with the converted audio in the result
            (and used when writing ``hasil.wav``).
        net_g: Synthesizer network forwarded to ``vc.pipeline``.
        vc: Object exposing ``pipeline(...)``.
        if_f0: Flag forwarded to ``vc.pipeline`` as its last argument.
        file_index: Feature-index path forwarded to ``vc.pipeline``.
        file_big_npy: Feature-file path forwarded to ``vc.pipeline``.
        urlAudio: URL of the audio to convert; ignored in TTS mode.
        f0_up_key: Transpose amount; coerced with ``int()``.
        f0_method: Forwarded unchanged to ``vc.pipeline``.
        index_rate: Forwarded unchanged to ``vc.pipeline``.
        tts_mode: When truthy, synthesize ``tts_text`` instead of downloading.
        tts_text: Text to synthesize in TTS mode.
        tts_voice: Voice label shaped like ``<ShortName>-<Gender>``; the
            trailing ``-<Gender>`` segment is dropped before calling edge-tts.

    Returns:
        ``("Success", (tgt_sr, audio))`` on success, ``(message, None)`` when
        the input is rejected or the download fails, and
        ``(traceback_text, (None, None))`` when an exception is raised.
    """
    try:
        if tts_mode:
            # Validate presence first: len(None) would raise before the
            # user-facing message could be returned.
            if tts_text is None or tts_voice is None:
                return "You need to enter text and select a voice", None
            if len(tts_text) > 100 and limitation:
                return "Text is too long", None
            voice_name = "-".join(tts_voice.split('-')[:-1])
            asyncio.run(edge_tts.Communicate(tts_text, voice_name).save("tts.mp3"))
            audio, _ = librosa.load("tts.mp3", sr=16000, mono=True)
        else:
            # Reject a missing URL before touching the network.
            if not urlAudio:
                return "You need to upload an audio", None
            # A timeout keeps a stalled server from hanging the run forever.
            res = requests.get(urlAudio, timeout=30)
            if res.status_code != 200:
                return "Gagal memuat audio", None
            # The response body is an encoded audio file, not a sample array,
            # so store the bytes verbatim and let librosa decode them.
            with open("temp.wav", "wb") as tmp_file:
                tmp_file.write(res.content)
            # librosa.load already yields float32 mono at the requested rate,
            # so no manual normalisation / down-mix / resample is needed.
            audio, _ = librosa.load("temp.wav", sr=16000, mono=True)
            duration = audio.shape[0] / 16000
            if duration > 20 and limitation:
                return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
        # Three-slot list passed to the pipeline; its entries are printed
        # below as npy / f0 / infer timings.
        times = [0, 0, 0]
        f0_up_key = int(f0_up_key)
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            0,  # NOTE(review): presumably the speaker id - confirm against VC.pipeline
            audio,
            times,
            f0_up_key,
            f0_method,
            file_index,
            file_big_npy,
            index_rate,
            if_f0,
        )
        print(
            f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
        )
        # Write at tgt_sr so the file matches the rate reported in the result.
        sf.write("hasil.wav", audio_opt, tgt_sr)
        return "Success", (tgt_sr, audio_opt)
    except Exception:
        # Report the failure as text; KeyboardInterrupt/SystemExit propagate.
        info = traceback.format_exc()
        print(info)
        return info, (None, None)
|
|
|
|
|
|
|
|
def load_hubert():
    """Load ``hubert_base.pt`` and publish it as the global ``hubert_model``.

    The first model of the loaded ensemble is moved to ``device``, cast to
    half or full precision according to ``is_half``, and switched to eval
    mode.
    """
    global hubert_model
    ensemble, _cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"], suffix=""
    )
    hubert_model = ensemble[0]
    hubert_model = hubert_model.to(device)
    # Precision follows the project-wide is_half switch.
    hubert_model = hubert_model.half() if is_half else hubert_model.float()
    hubert_model.eval()
|
|
|
|
|
|
|
|
def change_to_tts_mode(tts_mode):
    """Return visibility updates for the audio, text and voice widgets.

    In TTS mode the audio input is hidden and the textbox and dropdown are
    shown; otherwise the reverse. The result is the 3-tuple
    ``(Audio update, Textbox update, Dropdown update)``.
    """
    show_tts_inputs = bool(tts_mode)
    return (
        gr.Audio.update(visible=not show_tts_inputs),
        gr.Textbox.update(visible=show_tts_inputs),
        gr.Dropdown.update(visible=show_tts_inputs),
    )
|
|
|
|
|
|
|
|
# --- Script body: load HuBERT, then run one conversion per enabled model ---
load_hubert()
models = []
# asyncio.run() replaces the deprecated get_event_loop().run_until_complete()
# pattern and matches how create_vc_fn already drives edge-tts.
tts_voice_list = asyncio.run(edge_tts.list_voices())
# Labels are "<ShortName>-<Gender>"; create_vc_fn strips the gender suffix.
voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
with open("weights/model_info.json", "r", encoding="utf-8") as f:
    models_info = json.load(f)

# Fixed conversion settings applied to every model.
vc_input = "https://cdn-147.zippysha.re/RaheN03aza/1562de11-1690121778/evillin.mp3"
vc_transpose = 20
vc_f0method = 'pm'
vc_index_ratio = 1
tts_mode = False
tts_text = False
tts_voice = False
for name, info in models_info.items():
    if not info['enable']:
        continue
    title = info['title']
    author = info.get("author", None)
    cover = f"weights/{name}/{info['cover']}"
    index = f"weights/{name}/{info['feature_retrieval_library']}"
    npy = f"weights/{name}/{info['feature_file']}"
    # NOTE: torch.load unpickles the checkpoint, which can execute arbitrary
    # code - only place weights from trusted sources under weights/.
    cpt = torch.load(f"weights/{name}/{name}.pth", map_location="cpu")
    # The last config entry is the model's output sample rate.
    tgt_sr = cpt["config"][-1]
    # Align the third-from-last config entry with the number of rows in the
    # speaker-embedding weight before building the network.
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    if_f0 = cpt.get("f0", 1)
    if if_f0 == 1:
        net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
    else:
        net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    # enc_q is removed before loading weights; strict=False tolerates the
    # resulting key mismatch, and the load report is printed.
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(device)
    if is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, device, is_half)
    status, _ = create_vc_fn(
        tgt_sr, net_g, vc, if_f0, index, npy, vc_input, vc_transpose, vc_f0method, vc_index_ratio,
        tts_mode, tts_text, tts_voice)
    # Surface the outcome instead of dropping it; for a traceback the last
    # line is the exception summary.
    print(f"[{name}] {status.strip().splitlines()[-1]}")
|
|
|