Create app.py
app.py
ADDED
@@ -0,0 +1,80 @@
+import gradio as gr
+import whisper
+import datetime
+import torch
+import subprocess
+from pyannote.audio import Audio
+from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+from pyannote.core import Segment
+import wave
+import contextlib
+from sklearn.cluster import AgglomerativeClustering
+import numpy as np
+
+# Load Whisper model
+model_size = "medium.en"
+model = whisper.load_model(model_size)
+
+audio = Audio()
+embedding_model = PretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb", device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+def transcribe_and_diarize(audio_file, num_speakers=2):
+    path = audio_file.name
+    # Convert to WAV if necessary
+    if path[-3:] != 'wav':
+        subprocess.call(['ffmpeg', '-i', path, 'audio.wav', '-y'])
+        path = 'audio.wav'
+
+    # Transcribe audio
+    result = model.transcribe(path)
+    segments = result["segments"]
+
+    # Get audio duration
+    with contextlib.closing(wave.open(path, 'r')) as f:
+        frames = f.getnframes()
+        rate = f.getframerate()
+        duration = frames / float(rate)
+
+    # Define function to extract segment embeddings
+    def segment_embedding(segment):
+        start = segment["start"]
+        end = min(duration, segment["end"])
+        clip = Segment(start, end)
+        waveform, sample_rate = audio.crop(path, clip)
+        return embedding_model(waveform[None])
+
+    # Extract embeddings for each segment
+    embeddings = np.zeros(shape=(len(segments), 192))
+    for i, segment in enumerate(segments):
+        embeddings[i] = segment_embedding(segment)
+
+    embeddings = np.nan_to_num(embeddings)
+
+    # Perform speaker clustering (num_speakers may arrive as a float from the Number input, so cast to int)
+    clustering = AgglomerativeClustering(int(num_speakers)).fit(embeddings)
+    labels = clustering.labels_
+    for i in range(len(segments)):
+        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+    # Generate transcript
+    transcript = ""
+    for i, segment in enumerate(segments):
+        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+            transcript += "\n" + segment["speaker"] + ' ' + str(datetime.timedelta(seconds=round(segment["start"]))) + '\n'
+        transcript += segment["text"][1:] + ' '
+    transcript += "\n\n"
+
+    return transcript
+
+iface = gr.Interface(
+    fn=transcribe_and_diarize,
+    inputs=[
+        gr.Audio(source="upload", type="file"),
+        gr.Number(value=2, label="Number of Speakers")
+    ],
+    outputs="text",
+    title="Audio Transcription and Speaker Diarization",
+    description="Upload an audio file to get a transcription with speaker diarization."
+)
+
+iface.launch()
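
Note: transcribe_and_diarize() only reads the .name attribute of the object it receives, so the pipeline can also be checked without the web UI (for example from a REPL after loading app.py with iface.launch() skipped). A minimal sketch, where interview.wav is a hypothetical local 16-bit PCM recording and is not part of this commit:

    # Any object exposing a .name attribute works; an open file handle is enough.
    with open("interview.wav", "rb") as f:  # hypothetical example file
        print(transcribe_and_diarize(f, num_speakers=2))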