don0726 committed on
Commit
6f1a080
·
verified ·
1 Parent(s): bf19dfb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -61
app.py CHANGED
@@ -1,20 +1,24 @@
1
  import numpy as np
2
  import librosa
3
  import gradio as gr
4
- import torch
5
 
6
- from speechbrain.inference.speaker import EncoderClassifier
 
7
 
8
- # Load SpeechBrain model (CPU)
9
- classifier = EncoderClassifier.from_hparams(
10
- source="speechbrain/spkrec-ecapa-voxceleb",
11
- run_opts={"device": "cpu"}
12
- )
13
 
14
- CHUNK_DURATION = 30
15
- SIMILARITY_THRESHOLD = 0.65
 
 
 
 
 
 
 
16
 
17
- # 🔹 Your SAME segmentation (unchanged)
18
  def split_audio(y, sr, frame_sec=1.0):
19
  frame_len = int(sr * frame_sec)
20
  segments, times = [], []
@@ -30,58 +34,14 @@ def split_audio(y, sr, frame_sec=1.0):
30
 
31
  return segments, times
32
 
33
- # 🔹 NEW: SpeechBrain embedding
34
- def get_embedding(seg, sr):
35
- wav = torch.tensor(seg).unsqueeze(0)
36
- emb = classifier.encode_batch(wav)
37
- emb = emb.squeeze().detach().cpu().numpy()
38
-
39
- # normalize
40
- emb = emb / (np.linalg.norm(emb) + 1e-6)
41
- return emb
42
-
43
- # 🔹 Speaker memory (strong)
44
- class SpeakerMemory:
45
- def __init__(self):
46
- self.db = {}
47
- self.count = 0
48
-
49
- def match(self, emb):
50
- if not self.db:
51
- return self._new(emb)
52
-
53
- best_spk = None
54
- best_score = -1
55
-
56
- for spk, embs in self.db.items():
57
- centroid = np.mean(embs, axis=0)
58
- centroid = centroid / (np.linalg.norm(centroid) + 1e-6)
59
-
60
- score = np.dot(emb, centroid)
61
-
62
- if score > best_score:
63
- best_score = score
64
- best_spk = spk
65
-
66
- if best_score > SIMILARITY_THRESHOLD:
67
- self.db[best_spk].append(emb)
68
- return best_spk
69
- else:
70
- return self._new(emb)
71
-
72
- def _new(self, emb):
73
- self.count += 1
74
- spk = f"SPEAKER_{self.count}"
75
- self.db[spk] = [emb]
76
- return spk
77
-
78
- # 🔹 Main processing
79
  def process_audio(file_path):
80
  y, sr = librosa.load(file_path, sr=None)
81
  total_duration = len(y) / sr
82
 
83
- memory = SpeakerMemory()
84
  all_segments = []
 
 
 
85
 
86
  current_time = 0
87
 
@@ -93,8 +53,33 @@ def process_audio(file_path):
93
  segments, times = split_audio(chunk, sr)
94
 
95
  for seg, (s, e) in zip(segments, times):
96
- emb = get_embedding(seg, sr)
97
- speaker_id = memory.match(emb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  all_segments.append({
100
  "speaker": speaker_id,
@@ -110,14 +95,15 @@ def process_audio(file_path):
110
  def run(audio):
111
  if audio is None:
112
  return {"error": "Upload audio"}
 
113
  return process_audio(audio)
114
 
115
  demo = gr.Interface(
116
  fn=run,
117
  inputs=gr.Audio(type="filepath"),
118
  outputs=gr.JSON(),
119
- title="High Accuracy Speaker Diarization (SpeechBrain)",
120
- description="Hybrid: segmentation + ECAPA embeddings (~90% accuracy)"
121
  )
122
 
123
  demo.launch()
 
1
  import numpy as np
2
  import librosa
3
  import gradio as gr
4
+ from sklearn.preprocessing import StandardScaler
5
 
6
+ CHUNK_DURATION = 30
7
+ SIMILARITY_THRESHOLD = 0.75
8
 
9
def extract_features(y, sr):
    """Summarise an audio segment as a flat vector of averaged descriptors.

    The vector is the per-coefficient mean of 13 MFCCs, followed by the
    mean spectral centroid, mean spectral bandwidth, mean zero-crossing
    rate and mean RMS energy, in that order.

    Args:
        y: Audio samples, handed unchanged to the librosa feature
            extractors (presumably a mono float waveform; confirm
            against the caller).
        sr: Sampling rate of ``y``.

    Returns:
        A 1-D NumPy array holding the concatenated mean values.
    """
    # Average each MFCC coefficient across time frames.
    mfcc_means = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)

    # Each remaining descriptor is collapsed to a single scalar.
    scalar_means = [
        np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
        np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr)),
        np.mean(librosa.feature.zero_crossing_rate(y)),
        np.mean(librosa.feature.rms(y=y)),
    ]

    return np.array([*mfcc_means, *scalar_means])
21
 
 
22
  def split_audio(y, sr, frame_sec=1.0):
23
  frame_len = int(sr * frame_sec)
24
  segments, times = [], []
 
34
 
35
  return segments, times
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def process_audio(file_path):
38
  y, sr = librosa.load(file_path, sr=None)
39
  total_duration = len(y) / sr
40
 
 
41
  all_segments = []
42
+ speaker_embeddings = []
43
+ speaker_labels = []
44
+ speaker_count = 0
45
 
46
  current_time = 0
47
 
 
53
  segments, times = split_audio(chunk, sr)
54
 
55
  for seg, (s, e) in zip(segments, times):
56
+ feat = extract_features(seg, sr)
57
+
58
+ if speaker_embeddings:
59
+ scaler = StandardScaler()
60
+ X = np.vstack(speaker_embeddings + [feat])
61
+ X = scaler.fit_transform(X)
62
+ feat_norm = X[-1]
63
+ existing = X[:-1]
64
+ else:
65
+ feat_norm = feat
66
+ existing = []
67
+
68
+ assigned = False
69
+ for i, emb in enumerate(existing):
70
+ sim = np.dot(feat_norm, emb) / (
71
+ np.linalg.norm(feat_norm) * np.linalg.norm(emb)
72
+ )
73
+ if sim > SIMILARITY_THRESHOLD:
74
+ speaker_id = speaker_labels[i]
75
+ assigned = True
76
+ break
77
+
78
+ if not assigned:
79
+ speaker_count += 1
80
+ speaker_id = f"SPEAKER_{speaker_count}"
81
+ speaker_embeddings.append(feat)
82
+ speaker_labels.append(speaker_id)
83
 
84
  all_segments.append({
85
  "speaker": speaker_id,
 
95
  def run(audio):
96
  if audio is None:
97
  return {"error": "Upload audio"}
98
+
99
  return process_audio(audio)
100
 
101
  demo = gr.Interface(
102
  fn=run,
103
  inputs=gr.Audio(type="filepath"),
104
  outputs=gr.JSON(),
105
+ title="Speaker Diarization (CPU)",
106
+ description="Upload audio get speaker labels with timestamps"
107
  )
108
 
109
  demo.launch()