bhuvanabala0504 committed
Commit 8cac351 · verified · 1 Parent(s): d39f300

Upload 2 files

Files changed (2)
  1. app.py +75 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,75 @@
+ import gradio as gr
+ import torch
+ import numpy as np
+ import datetime
+ from pyannote.audio import Audio
+ from pyannote.core import Segment
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+ from sklearn.cluster import AgglomerativeClustering
+
+ # Load the embedding model (runs once when the Space starts)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ embedding_model = PretrainedSpeakerEmbedding(
+     "speechbrain/spkrec-ecapa-voxceleb",
+     device=device
+ )
+ audio_helper = Audio()
+
+ def time_str(secs):
+     return str(datetime.timedelta(seconds=round(secs)))
+
+ def process_audio(audio_file, num_speakers):
+     # 'audio_file' is the path to the uploaded file provided by Gradio
+     duration = audio_helper.get_duration(audio_file)
+
+     # 1. Extract Voiceprints (Embeddings) from fixed-length windows
+     step = 2.0
+     embeddings = []
+     timestamps = []
+
+     for start in np.arange(0, duration, step):
+         end = min(duration, start + step)
+         clip = Segment(start, end)
+         waveform, sample_rate = audio_helper.crop(audio_file, clip)
+
+         # Ensure mono for the model
+         if waveform.shape[0] > 1:
+             waveform = waveform.mean(dim=0, keepdim=True)
+
+         # The model returns a (1, 192) array; keep the flat 192-dim vector
+         embeddings.append(embedding_model(waveform[None])[0])
+         timestamps.append((start, end))
+
+     # Stack into a 2-D (n_windows, 192) matrix, as the clusterer requires
+     embeddings = np.nan_to_num(np.vstack(embeddings))
+
+     # 2. Perform Clustering based on user input (num_speakers)
+     clustering = AgglomerativeClustering(n_clusters=int(num_speakers)).fit(embeddings)
+     labels = clustering.labels_
+
+     # 3. Create the Output String
+     result = "--- SPEAKER DIARIZATION TIMELINE ---\n\n"
+     current_speaker = None
+
+     for i, label in enumerate(labels):
+         speaker_name = f"Speaker {label + 1}"
+         start, end = timestamps[i]
+
+         if speaker_name != current_speaker:
+             result += f"[{time_str(start)}] {speaker_name} starts speaking\n"
+             current_speaker = speaker_name
+
+     return result
+
+ # 4. Define the Gradio Interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🎙️ Speaker Diarization Tool")
+
+     with gr.Row():
+         input_audio = gr.Audio(type="filepath", label="1. Upload or Record Audio")
+         num_spks = gr.Slider(minimum=1, maximum=10, value=2, step=1, label="2. Number of Speakers")
+
+     btn = gr.Button("Analyze Speakers")
+     output_text = gr.Textbox(label="3. Diarization Results", lines=10)
+
+     btn.click(fn=process_audio, inputs=[input_audio, num_spks], outputs=output_text)
+
+ demo.launch()
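
The clustering step needs a 2-D array of shape (n_windows, embedding_dim), which is why the per-window voiceprints are stacked with np.vstack before fitting. A minimal sketch of that step in isolation, using random vectors as stand-ins for real ECAPA embeddings (the shapes and the labels_ output are the only things it illustrates):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Stand-ins for real embeddings: 10 windows x 192 dims, drawn from two
# shifted distributions to mimic two distinct speakers.
rng = np.random.default_rng(0)
fake_embeddings = np.vstack([
    rng.normal(loc=0.0, size=(5, 192)),  # "speaker 1" windows
    rng.normal(loc=5.0, size=(5, 192)),  # "speaker 2" windows
])

labels = AgglomerativeClustering(n_clusters=2).fit(fake_embeddings).labels_
print(labels)  # one integer label per window, e.g. [0 0 0 0 0 1 1 1 1 1]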
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ pyannote.audio
+ speechbrain
+ gradio
+ torch
+ numpy
+ scikit-learn
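
To try the Space locally, install the dependencies and start the app; Gradio serves the UI at http://127.0.0.1:7860 by default, and the first run downloads the speechbrain/spkrec-ecapa-voxceleb weights from the Hugging Face Hub:

pip install -r requirements.txt
python app.py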