"""Batch voice-conversion script.

Loads the HuBERT feature extractor and every enabled model listed in
``weights/model_info.json``, then converts one input clip (downloaded from
``vc_input``) or one synthesized TTS clip with each model and writes the
result to ``hasil.wav``.
"""
import os
import json
import argparse
import traceback
import logging
import asyncio
from datetime import datetime
from urllib.parse import urlparse

import requests
import gradio as gr
import numpy as np
import librosa
import torch
import soundfile as sf
import edge_tts
from fairseq import checkpoint_utils

from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
from vc_infer_pipeline import VC
from config import (
    is_half,
    device
)

logging.getLogger("numba").setLevel(logging.WARNING)

# limit audio length in huggingface spaces
limitation = os.getenv("SYSTEM") == "spaces"

# Sample rate every input clip is decoded/resampled to before conversion.
_INPUT_SR = 16000
# Limits enforced only when `limitation` is set.
_MAX_TTS_CHARS = 100
_MAX_DURATION_S = 20
# Upper bound for the input download so a stalled server cannot hang the run.
_URL_TIMEOUT_S = 30


def _fetch_audio_file(url):
    """Download *url* and store the raw bytes on disk.

    Returns the path of the saved file, or ``None`` when the server did not
    answer with HTTP 200. The file keeps the extension found in the URL path
    (falling back to ``.wav``) so its name matches its content.
    """
    res = requests.get(url, timeout=_URL_TIMEOUT_S)
    if res.status_code != 200:
        return None
    suffix = os.path.splitext(urlparse(url).path)[1] or ".wav"
    path = f"temp{suffix}"
    # The payload is an encoded audio file, not a sample array, so it has to
    # be written verbatim in binary mode rather than through sf.write().
    with open(path, "wb") as fh:
        fh.write(res.content)
    return path


def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy, urlAudio,
                 f0_up_key, f0_method, index_rate, tts_mode, tts_text, tts_voice):
    """Run one voice conversion and write the result to ``hasil.wav``.

    The input is either text synthesized with edge-tts (``tts_mode`` truthy)
    or an audio file downloaded from ``urlAudio``. Either way it is decoded
    to mono at 16 kHz and handed to ``vc.pipeline``.

    Args:
        tgt_sr: Sample rate reported for (and used to write) the output.
        net_g: Synthesizer network passed to ``vc.pipeline``.
        vc: Pipeline object exposing ``pipeline(...)``.
        if_f0: Pitch-guidance flag forwarded to ``vc.pipeline``.
        file_index: Path of the feature retrieval index.
        file_big_npy: Path of the feature file.
        urlAudio: URL of the input audio (ignored in TTS mode).
        f0_up_key: Transpose value; converted with ``int()``.
        f0_method: Pitch extraction method name forwarded to the pipeline.
        index_rate: Retrieval ratio forwarded to the pipeline.
        tts_mode: When truthy, synthesize ``tts_text`` instead of downloading.
        tts_text: Text to synthesize in TTS mode.
        tts_voice: Voice label formatted as ``"<ShortName>-<Gender>"``.

    Returns:
        ``("Success", (tgt_sr, audio))`` on success, ``(message, None)`` when
        the input is rejected, or ``(traceback_text, (None, None))`` when an
        exception was raised while processing.
    """
    try:
        if tts_mode:
            # Validate presence first: len() on a missing text would raise.
            if not tts_text or not tts_voice:
                return "You need to enter text and select a voice", None
            if len(tts_text) > _MAX_TTS_CHARS and limitation:
                return "Text is too long", None
            # The voice label carries a trailing "-<Gender>"; strip it to get
            # the edge-tts short name back.
            short_name = "-".join(tts_voice.split('-')[:-1])
            asyncio.run(edge_tts.Communicate(tts_text, short_name).save("tts.mp3"))
            audio, _ = librosa.load("tts.mp3", sr=_INPUT_SR, mono=True)
        else:
            if urlAudio is None:
                return "You need to upload an audio", None
            audio_path = _fetch_audio_file(urlAudio)
            if audio_path is None:
                return "Gagal memuat audio", None
            # librosa decodes, downmixes and resamples in a single step, so
            # no manual normalisation / to_mono / resample is needed here.
            audio, _ = librosa.load(audio_path, sr=_INPUT_SR, mono=True)
            duration = audio.shape[0] / _INPUT_SR
            if duration > _MAX_DURATION_S and limitation:
                return (
                    "Please upload an audio file that is less than 20 seconds. "
                    "If you need to generate a longer audio file, please use Colab."
                ), None

        times = [0, 0, 0]
        f0_up_key = int(f0_up_key)
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            0,
            audio,
            times,
            f0_up_key,
            f0_method,
            file_index,
            file_big_npy,
            index_rate,
            if_f0,
        )
        print(
            f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
        )
        # Write at the rate that is returned together with the samples;
        # assumes vc.pipeline emits audio at tgt_sr, as the returned
        # (tgt_sr, audio_opt) pair implies.
        sf.write("hasil.wav", audio_opt, tgt_sr)
        return "Success", (tgt_sr, audio_opt)
    except Exception:
        # Best-effort boundary: report the failure to the caller instead of
        # crashing, but let KeyboardInterrupt / SystemExit propagate.
        info = traceback.format_exc()
        print(info)
        return info, (None, None)


def load_hubert():
    """Load ``hubert_base.pt`` into the module-level ``hubert_model``.

    The model is moved to ``device``, cast to half or full precision
    according to ``is_half``, and put in eval mode.
    """
    global hubert_model
    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    hubert_model = models[0]
    hubert_model = hubert_model.to(device)
    if is_half:
        hubert_model = hubert_model.half()
    else:
        hubert_model = hubert_model.float()
    hubert_model.eval()


def change_to_tts_mode(tts_mode):
    """Return visibility updates for the audio, text and voice widgets.

    In TTS mode the audio input is hidden and the text box and voice
    dropdown are shown; otherwise the reverse.
    """
    if tts_mode:
        return gr.Audio.update(visible=False), gr.Textbox.update(visible=True), gr.Dropdown.update(visible=True)
    else:
        return gr.Audio.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False)


load_hubert()
models = []
# asyncio.run() creates and closes its own loop; get_event_loop() without a
# running loop is deprecated.
tts_voice_list = asyncio.run(edge_tts.list_voices())
voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
with open("weights/model_info.json", "r", encoding="utf-8") as f:
    models_info = json.load(f)

# Conversion settings shared by every model.
vc_input = "https://cdn-147.zippysha.re/RaheN03aza/1562de11-1690121778/evillin.mp3"
vc_transpose = 20
vc_f0method = 'pm'  # pm or harvest
vc_index_ratio = 1
tts_mode = False
tts_text = False
tts_voice = False

# NOTE(review): every enabled model writes to the same "hasil.wav", so only
# the last model's output survives the loop — confirm this is intended.
for name, info in models_info.items():
    if not info['enable']:
        continue
    title = info['title']
    author = info.get("author", None)
    cover = f"weights/{name}/{info['cover']}"
    index = f"weights/{name}/{info['feature_retrieval_library']}"
    npy = f"weights/{name}/{info['feature_file']}"
    cpt = torch.load(f"weights/{name}/{name}.pth", map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    if if_f0 == 1:
        net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
    else:
        net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    # Without this line the model is not cleaned up properly — odd, but needed.
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(device)
    if is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, device, is_half)
    create_vc_fn(
        tgt_sr,
        net_g,
        vc,
        if_f0,
        index,
        npy,
        vc_input,
        vc_transpose,
        vc_f0method,
        vc_index_ratio,
        tts_mode,
        tts_text,
        tts_voice,
    )