Spaces:
Sleeping
Sleeping
| from score import load_model | |
| from predict import loadWav | |
| import torch | |
| import torch.nn.functional as F | |
| import gradio as gr | |
# Checkpoint file handed to the project's `load_model` helper.
MODEL_CHECKPOINT = "wavlm_ecapa.model"

# Load the model once at import time so every request reuses the same
# instance, then call eval() on it (presumably a torch.nn.Module, in which
# case this puts it in inference mode -- confirm in score.load_model).
model = load_model(MODEL_CHECKPOINT)
model.eval()
def calc_voxsim(inp_path, ref_path):
    """Score the similarity between an input and a reference recording.

    Both arguments are forwarded unchanged to ``loadWav``, which returns two
    waveform arrays per recording. Each array is embedded by the module-level
    ``model``, the embeddings are L2-normalised along ``dim=1``, and the mean
    of the input-vs-reference dot products is computed separately for the
    first and for the second array. The result is the average of those two
    means.

    Args:
        inp_path: Input recording, in whatever form ``loadWav`` accepts
            (presumably a file path -- confirm against predict.loadWav).
        ref_path: Reference recording, same form as ``inp_path``.

    Returns:
        The averaged score converted with ``.detach().cpu().numpy()``.
    """
    # loadWav yields a pair per recording; presumably a stack of crops and
    # the whole utterance -- TODO confirm in predict.loadWav.
    inp_multi, inp_whole = loadWav(inp_path)
    ref_multi, ref_whole = loadWav(ref_path)

    batches = [
        torch.FloatTensor(wave)
        for wave in (inp_multi, inp_whole, ref_multi, ref_whole)
    ]

    with torch.no_grad():
        # NOTE(review): `foward` (sic) is the method name used by the project
        # model; looks like a misspelling of `forward` -- confirm in score.py
        # before renaming.
        inp_multi_emb, inp_whole_emb, ref_multi_emb, ref_whole_emb = [
            F.normalize(model.foward(batch), p=2, dim=1) for batch in batches
        ]

        # With unit-length rows, each matmul entry is a cosine similarity.
        multi_score = torch.matmul(inp_multi_emb, ref_multi_emb.T).mean()
        whole_score = torch.matmul(inp_whole_emb, ref_whole_emb.T).mean()
        combined = (multi_score + whole_score) / 2

    return combined.detach().cpu().numpy()
# Markdown blurb rendered under the demo title.
description = """
Voice similarity demo using wavlm-ecapa model, which is trained on Voxsim dataset.
This demo only accepts .wav format. Best at 16 kHz sampling rate.
Paper is available [here](https://arxiv.org/abs/2407.18505)
"""

# Build the web UI: two audio inputs in, the similarity score out as text.
iface = gr.Interface(
    fn=calc_voxsim,
    # gr.Audio defaults to type="numpy", which would hand calc_voxsim a
    # (sample_rate, ndarray) tuple. calc_voxsim forwards its arguments to
    # loadWav as paths, so request the temp-file path instead.
    # `inputs` is a list because that is the form every Gradio version accepts.
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Audio(label="Reference Audio", type="filepath"),
    ],
    outputs="text",
    title="voice similarity with VoxSim",
    description=description,
)

# Launch separately so `iface` stays bound to the Interface object rather
# than to whatever launch() returns.
iface.launch()