"""Gradio demo that scores the voice similarity of two audio clips."""

import time

import gradio as gr
import torch
import torch.nn.functional as F

from predict import loadWav
from score import load_model

# Load the checkpoint once at import time so every request reuses it, and put
# the model in evaluation mode before serving.
model = load_model("wavlm_ecapa.model")
model.eval()


def _log_elapsed(label, start):
    """Print the seconds elapsed since *start*, tagged with the stage label."""
    print(f"{label} time: ", time.perf_counter() - start)


def _embed(wav):
    """Return the model output for *wav*, L2-normalised along dim 1."""
    return F.normalize(model.forward(wav), p=2, dim=1)


def calc_voxsim(inp_path, ref_path):
    """Score the similarity between an input clip and a reference clip.

    ``loadWav`` yields two waveform representations per clip (presumably a
    batch of segments and the full utterance -- confirm against ``predict``).
    Each representation is embedded and L2-normalised, so the matrix product
    of input and reference embeddings holds cosine similarities. The mean
    similarity is taken per representation and the two means are averaged.

    Cumulative timings for every stage are printed to stdout.

    Args:
        inp_path: Input audio, in whatever form ``loadWav`` accepts.
        ref_path: Reference audio, in whatever form ``loadWav`` accepts.

    Returns:
        The averaged similarity score as a zero-dimensional NumPy array.
    """
    # perf_counter is monotonic, so the reported durations cannot be skewed
    # by wall-clock adjustments.
    start = time.perf_counter()

    inp_wavs, inp_wav = loadWav(inp_path)
    ref_wavs, ref_wav = loadWav(ref_path)
    _log_elapsed("loadWav", start)

    inp_wavs, inp_wav, ref_wavs, ref_wav = (
        torch.FloatTensor(wav)
        for wav in (inp_wavs, inp_wav, ref_wavs, ref_wav)
    )
    _log_elapsed("torch.FloatTensor", start)

    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        input_emb_1 = _embed(inp_wavs)
        _log_elapsed("input_emb_1", start)
        input_emb_2 = _embed(inp_wav)
        _log_elapsed("input_emb_2", start)
        ref_emb_1 = _embed(ref_wavs)
        _log_elapsed("ref_emb_1", start)
        ref_emb_2 = _embed(ref_wav)
        _log_elapsed("ref_emb_2", start)

    # Embeddings are unit-length, so each dot product is a cosine similarity.
    score_1 = torch.mean(torch.matmul(input_emb_1, ref_emb_1.T))
    score_2 = torch.mean(torch.matmul(input_emb_2, ref_emb_2.T))
    score = (score_1 + score_2) / 2
    _log_elapsed("score", start)

    return score.detach().cpu().numpy()


# Rendered by Gradio underneath the title.
description = (
    " Voice similarity demo using wavlm-ecapa model, which is trained on"
    " Voxsim dataset. This demo only accepts .wav format. Best at 16 kHz"
    " sampling rate. Paper is available"
    " [here](https://arxiv.org/abs/2407.18505) "
)

# NOTE(review): gr.Audio defaults to type="numpy", which hands the callback a
# (sample_rate, data) tuple, while calc_voxsim names its parameters *_path.
# Confirm what loadWav expects; type="filepath" may be the intended setting.
iface = gr.Interface(
    fn=calc_voxsim,
    # A list is the form the Interface documentation describes for several
    # input components.
    inputs=[
        gr.Audio(label="Input Audio"),
        gr.Audio(label="Reference Audio"),
    ],
    outputs="text",
    title="voice similarity with VoxSim",
    description=description,
)
# Keep `iface` bound to the Interface itself rather than to launch()'s return
# value, so the app object stays reachable under its name.
iface.launch()