Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| import torch.nn.functional as F | |
| from ssl_ecapa_model import SSL_ECAPA_TDNN | |
| from score import loadModel | |
| from predict import loadWav | |
| import gradio as gr | |
# Load the VoxSim WavLM-ECAPA checkpoint once at start-up so every request
# reuses the same module-level instance, then switch it to evaluation mode.
model = loadModel("voxsim_wavlm_ecapa.model")
model.eval()
def calc_voxsim(inp_path, ref_path):
    """Score the voice similarity between two audio files.

    Each file is read with ``loadWav(..., max_frames=0)`` (presumably
    ``max_frames=0`` means "use the whole recording" -- confirm against
    ``predict.loadWav``), embedded with the module-level ``model``,
    L2-normalised along ``dim=1`` and compared with a matrix product of the
    normalised embeddings, i.e. their cosine similarity.

    Args:
        inp_path: Path of the input recording, as supplied by the
            ``gr.Audio(type='filepath')`` component.
        ref_path: Path of the reference recording.

    Returns:
        A NumPy array holding the product of the normalised input embedding
        with the transposed normalised reference embedding.

    Raises:
        ValueError: If either path is missing (``None`` or empty), which is
            what the UI passes when an audio slot was left blank.
    """
    # Fail early with a readable message instead of an opaque error from
    # deep inside the audio loader when an upload is missing.
    if not inp_path or not ref_path:
        raise ValueError(
            "Both an input audio file and a reference audio file are required."
        )

    inp_wav = loadWav(inp_path, max_frames=0)
    ref_wav = loadWav(ref_path, max_frames=0)

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        input_emb = F.normalize(model.forward(inp_wav), p=2, dim=1)
        ref_emb = F.normalize(model.forward(ref_wav), p=2, dim=1)
        score = torch.matmul(input_emb, ref_emb.T)

    return score.detach().cpu().numpy()
# Markdown text handed to gr.Interface as the demo's description.
description = """
Voice similarity demo using wavlm-ecapa model, which is trained on Voxsim dataset.
This demo only accepts .wav format. Best at 16 kHz sampling rate.
The inference process of this Spaces demo is suboptimal due to the limitations of a basic CPU. To obtain an accurate score, refer to the "[voxsim_trainer](https://github.com/kaistmm/voxsim_trainer)" repository and run the code via the CLI.
Paper is available [here](https://arxiv.org/abs/2407.18505)
"""
# Two audio inputs (input + reference) feed calc_voxsim; its result is shown
# in a text output.
iface = gr.Interface(
    fn=calc_voxsim,
    # Gradio documents `inputs` as a list of components. type='filepath'
    # makes each component pass calc_voxsim a path to the audio file.
    inputs=[
        gr.Audio(label="Input Audio", type='filepath'),
        gr.Audio(label="Reference Audio", type='filepath'),
    ],
    outputs="text",
    title="voice similarity with VoxSim",
    description=description,
)

# Launch in a separate statement so `iface` names the Interface itself rather
# than whatever launch() returns.
iface.launch()