| import gradio as gr |
|
|
| import logging |
# Raise the "numba" logger's threshold so records below WARNING are dropped.
numba_logger = logging.getLogger("numba")
numba_logger.setLevel(logging.WARNING)
| import torch,pdb |
| import numpy as np |
| from models import SynthesizerTrnNoF0256 |
| from fairseq import checkpoint_utils |
| import torch.nn.functional as F |
| import librosa |
|
|
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fairseq checkpoint whose features vc_fn feeds to the synthesizer.
model_path = "checkpoint_best_legacy_500.pt"
print(f"load model(s) from {model_path}")
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
    [model_path], suffix=""
)

# Only the first model of the returned ensemble is used; put it on the
# selected device and switch it to evaluation mode.
model = models[0].to(device)
model.eval()
|
|
# Build the synthesizer. The positional hyper-parameters have to describe the
# same architecture as the checkpoint below, because the state dict is loaded
# with strict=True.
net_g = SynthesizerTrnNoF0256(
    513,
    40,
    192,
    192,
    768,
    2,
    6,
    3,
    0.1,
    "1",
    [3, 7, 11],
    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    [10, 4, 2, 2, 2],
    512,
    [16, 16, 4, 4, 4],
    0,
)

# NOTE(review): torch.load unpickles the file, so only trusted checkpoints
# should be placed at this path.
# The weights are mapped to the CPU first; the module is moved to `device`
# afterwards.
weights = torch.load("trump.pt", map_location=torch.device("cpu"))
net_g.load_state_dict(weights, strict=True)
net_g.eval().to(device)
|
|
|
|
def vc_fn(input_audio):
    """Convert an uploaded clip with the loaded voice-conversion model.

    Args:
        input_audio: ``(sampling_rate, samples)`` tuple, or ``None`` when
            nothing was uploaded. ``samples`` is a NumPy array; 2-D input is
            assumed to be laid out as ``(samples, channels)`` (it is
            transposed before the mono mix-down) -- TODO confirm against the
            Gradio version in use.

    Returns:
        A ``(message, audio)`` tuple. On success ``audio`` is
        ``(32000, waveform)`` with ``waveform`` a float NumPy array; when the
        input is rejected (missing, empty, or longer than 45 seconds)
        ``audio`` is ``None`` and ``message`` explains why.
    """
    if input_audio is None:
        return "You need to upload an audio", None

    sampling_rate, audio = input_audio

    # A zero-length clip would pass the duration check below and only fail
    # later inside resampling / the models, so reject it up front.
    if audio.shape[0] == 0:
        return "You need to upload an audio", None

    duration = audio.shape[0] / sampling_rate
    if duration > 45:
        return "请上传小于45s的音频,需要转换长音频请使用colab", None

    if np.issubdtype(audio.dtype, np.integer):
        # Integer PCM: scale by the dtype's maximum value.
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    else:
        # np.iinfo raises ValueError for floating dtypes. Float samples are
        # assumed to be normalized already -- NOTE(review): confirm the range.
        audio = audio.astype(np.float32)

    if audio.ndim > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        # The feature extractor is fed 16 kHz audio.
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    print(audio.shape)

    feats = torch.from_numpy(audio).float()
    # Invariant after the mono mix-down: a single 1-D waveform.
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)
    # Nothing is padded, so the mask is all False.
    padding_mask = torch.zeros_like(feats, dtype=torch.bool)
    inputs = {
        "source": feats.to(device),
        "padding_mask": padding_mask.to(device),
        "output_layer": 9,
    }
    with torch.no_grad():
        logits = model.extract_features(**inputs)
        feats = model.final_proj(logits[0])

    # Double the number of frames along dim 1 (interpolate works on the last
    # dim, hence the permutes), then cap the sequence at 10000 frames.
    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
    n_frames = min(feats.shape[1], 10000)
    feats = feats[:, :n_frames, :]
    p_len = torch.tensor([n_frames], dtype=torch.long, device=device)

    with torch.no_grad():
        audio = net_g.infer(feats, p_len)[0][0, 0].detach().cpu().float().numpy()

    return "Success", (32000, audio)
|
|
|
|
# Single-tab UI: one audio input, a convert button, and a status message plus
# the converted audio as outputs.
with gr.Blocks() as app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value="")
            vc_input3 = gr.Audio(label="上传音频(长度小于45秒)")
            vc_submit = gr.Button("转换", variant="primary")
            vc_output1 = gr.Textbox(label="Output Message")
            vc_output2 = gr.Audio(label="Output Audio")
            # vc_fn returns (message, audio), matching the two outputs.
            vc_submit.click(
                vc_fn,
                inputs=[vc_input3],
                outputs=[vc_output1, vc_output2],
            )

app.launch()