import os

# Set the flag before pyannote is imported so it is already visible if the
# library reads it at import time.
# NOTE(review): confirm when pyannote actually reads this variable; setting
# it early is harmless either way.
os.environ["PYANNOTE_SKIP_DEPENDENCY_CHECK"] = "1"

import torchaudio  # noqa: E402
from pyannote.audio import Pipeline  # noqa: E402
from torchaudio.functional import resample  # noqa: E402
_MODEL_ID = "shethjenil/speaker-diarization-community-1"
_TARGET_SAMPLE_RATE = 16000
_SPEAKER_PREFIX = "SPEAKER_"

# Lazily created pipeline instance, shared by every call to `process`.
_PIPELINE = None


def _get_pipeline():
    """Return the diarization pipeline, loading it on first use only."""
    global _PIPELINE
    if _PIPELINE is None:
        _PIPELINE = Pipeline.from_pretrained(_MODEL_ID)
    return _PIPELINE


def _load_mono_waveform(path, target_sr):
    """Load *path* as a single-channel waveform sampled at *target_sr*."""
    audio, sr = torchaudio.load(path)
    # Down-mix first: averaging and resampling are both linear, so the result
    # is the same but only one channel has to be resampled.
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)
    if sr != target_sr:
        audio = resample(audio, sr, target_sr)
    return audio


def _segment_rows(segments):
    """Convert serialized segments to ``[start, end, speaker_index]`` rows."""
    # `removeprefix` drops exactly the label prefix; `lstrip` would instead
    # strip any leading run of the characters S, P, E, A, K, R and "_".
    return [
        [
            seg["start"],
            seg["end"],
            int(seg["speaker"].removeprefix(_SPEAKER_PREFIX)),
        ]
        for seg in segments
    ]


def process(input_file):
    """Run speaker diarization on an audio file.

    Args:
        input_file: Path of the audio file to analyse.

    Returns:
        A dict with three keys:
        ``"diarization"`` and ``"exclusive_diarization"`` -- lists of
        ``[start, end, speaker_index]`` rows taken from the pipeline's
        serialized output; and ``"embedding"`` -- the pipeline's speaker
        embeddings converted to nested lists, or ``None`` when the pipeline
        provides none.

    Raises:
        ValueError: If ``input_file`` is empty or ``None`` (for example when
            the form is submitted without an upload).
    """
    if not input_file:
        raise ValueError("No audio file provided.")

    waveform = _load_mono_waveform(input_file, _TARGET_SAMPLE_RATE)
    output = _get_pipeline()(
        {"waveform": waveform, "sample_rate": _TARGET_SAMPLE_RATE}
    )

    # Serialize once and reuse the result for both views.
    serialized = output.serialize()

    # NOTE(review): speaker_embeddings is assumed to be possibly None for some
    # pipeline configurations -- confirm against the pyannote output type.
    embeddings = output.speaker_embeddings

    return {
        "diarization": _segment_rows(serialized["diarization"]),
        "exclusive_diarization": _segment_rows(
            serialized["exclusive_diarization"]
        ),
        "embedding": embeddings.tolist() if embeddings is not None else None,
    }
import gradio as gr

# Build the web UI: one audio input handed to `process` as a file path, and a
# JSON view of the dictionary it returns; then start the server.
demo = gr.Interface(
    fn=process,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.JSON(),
)
demo.launch()