# Source: Hugging Face Space file "app.py" by shethjenil
# (commit d18d03b, "Create app.py", 994 bytes).
import torchaudio
from torchaudio.functional import resample
import os
from pyannote.audio import Pipeline
# Ask pyannote to skip its dependency check.
# NOTE(review): this assignment runs AFTER `from pyannote.audio import Pipeline`
# above. If pyannote reads the variable at import time, setting it here has no
# effect -- confirm when it is read and, if needed, move this line above the
# pyannote import.
os.environ["PYANNOTE_SKIP_DEPENDENCY_CHECK"] = "1"
# Checkpoint used for diarization and the sample rate the audio is converted to
# before it is handed to the pipeline.
_MODEL_ID = "shethjenil/speaker-diarization-community-1"
_TARGET_SAMPLE_RATE = 16000

# Prefix of the speaker labels found in the serialized output ("SPEAKER_00").
_SPEAKER_PREFIX = "SPEAKER_"

# Loaded pipelines keyed by checkpoint name, so the model is instantiated once
# per process instead of on every request.
_PIPELINE_CACHE = {}


def _get_pipeline(model_id=_MODEL_ID):
    """Return the pipeline for *model_id*, loading it on first use."""
    pipeline = _PIPELINE_CACHE.get(model_id)
    if pipeline is None:
        pipeline = Pipeline.from_pretrained(model_id)
        _PIPELINE_CACHE[model_id] = pipeline
    return pipeline


def _speaker_index(label):
    """Convert a speaker label such as ``"SPEAKER_03"`` to the integer ``3``."""
    # removeprefix() drops the literal prefix; lstrip() would instead strip any
    # leading characters drawn from the set {S, P, E, A, K, R, _}.
    return int(label.removeprefix(_SPEAKER_PREFIX))


def _segment_rows(segments):
    """Turn serialized segments into ``[start, end, speaker_index]`` rows."""
    return [
        [segment["start"], segment["end"], _speaker_index(segment["speaker"])]
        for segment in segments
    ]


def process(input_file):
    """Run speaker diarization on an audio file.

    The audio is loaded with torchaudio, resampled to 16 kHz when its sample
    rate differs, averaged over its first dimension when that dimension is
    larger than one, and passed to the diarization pipeline.

    Args:
        input_file: Path of the audio file to analyse (forwarded to
            ``torchaudio.load``).

    Returns:
        A dict with three keys:
            ``"diarization"``: list of ``[start, end, speaker_index]`` rows.
            ``"exclusive_diarization"``: list of rows in the same format,
                built from the ``"exclusive_diarization"`` entry of the
                serialized output.
            ``"embedding"``: ``output.speaker_embeddings`` converted with
                ``tolist()``.
    """
    pipeline = _get_pipeline()

    audio, sr = torchaudio.load(input_file)
    if sr != _TARGET_SAMPLE_RATE:
        audio = resample(audio, sr, _TARGET_SAMPLE_RATE)
    # torchaudio.load returns (channels, frames) by default, so averaging over
    # dim 0 downmixes multi-channel audio to a single channel.
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)

    output = pipeline({"waveform": audio, "sample_rate": _TARGET_SAMPLE_RATE})

    # Serialize once and reuse the result for both segment lists.
    serialized = output.serialize()
    return {
        "diarization": _segment_rows(serialized["diarization"]),
        "exclusive_diarization": _segment_rows(serialized["exclusive_diarization"]),
        "embedding": output.speaker_embeddings.tolist(),
    }
import gradio as gr
# Web UI: one audio input delivered to `process` as a file path, with the
# returned dict rendered as JSON.
audio_input = gr.Audio(type="filepath")
json_output = gr.JSON()
demo = gr.Interface(fn=process, inputs=audio_input, outputs=json_output)
demo.launch()