import os

# Set before pyannote.audio is imported: if the dependency-version check runs
# at import time, setting this afterwards (as the original did) has no effect.
os.environ["PYANNOTE_SKIP_DEPENDENCY_CHECK"] = "1"

import torchaudio
from torchaudio.functional import resample
from pyannote.audio import Pipeline
import gradio as gr

# The pipeline expects 16 kHz mono input.
TARGET_SR = 16000

# Load the diarization pipeline once at startup, not on every request:
# from_pretrained downloads/instantiates the model, which is far too
# expensive to repeat per call.
_pipeline = Pipeline.from_pretrained("shethjenil/speaker-diarization-community-1")


def _speaker_index(label: str) -> int:
    """Map a pyannote speaker label such as 'SPEAKER_07' to the integer 7.

    Uses removeprefix rather than lstrip: lstrip("SPEAKER_") strips any of
    those *characters* from the left, not the prefix string, and only works
    by accident because the labels' numeric suffixes contain no letters.
    """
    return int(label.removeprefix("SPEAKER_"))


def _segments(entries) -> list:
    """Convert serialized diarization entries to [start, end, speaker_id] triples."""
    return [[e["start"], e["end"], _speaker_index(e["speaker"])] for e in entries]


def process(input_file):
    """Run speaker diarization on an audio file.

    Parameters
    ----------
    input_file : str
        Path to an audio file readable by torchaudio.

    Returns
    -------
    dict
        "diarization": list of [start_sec, end_sec, speaker_id] segments,
        "exclusive_diarization": same shape, from the pipeline's exclusive
        (non-overlapping) output,
        "embedding": speaker embeddings as nested Python lists.
    """
    audio, sr = torchaudio.load(input_file)
    # Resample only when the source rate differs from the pipeline's rate.
    if sr != TARGET_SR:
        audio = resample(audio, sr, TARGET_SR)
    # Downmix multi-channel audio to mono, keeping the channel dimension
    # so the waveform stays shaped (1, num_samples).
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)

    output = _pipeline({"waveform": audio, "sample_rate": TARGET_SR})
    serialized = output.serialize()  # serialize once, not once per key
    return {
        "diarization": _segments(serialized["diarization"]),
        "exclusive_diarization": _segments(serialized["exclusive_diarization"]),
        "embedding": output.speaker_embeddings.tolist(),
    }


if __name__ == "__main__":
    gr.Interface(
        process,
        inputs=gr.Audio(type="filepath"),
        outputs=gr.JSON(),
    ).launch()