Capstone04 committed on
Commit c710b8a · verified · 1 Parent(s): 45e0229

Upload folder using huggingface_hub
README.md CHANGED
@@ -9,11 +9,8 @@ pipeline_tag: automatic-speech-recognition
 
 This package provides an **Automatic Speech Recognition (ASR) + Speaker Diarization** pipeline using:
 - [OpenAI Whisper](https://huggingface.co/openai/whisper-medium)
-- [Pyannote diarization](https://huggingface.co/pyannote/speaker-diarization-3.1)
+- [SpeechBrain ECAPA-TDNN](https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb)
 
 ## Install
 ```bash
-pip install git+https://huggingface.co/Capstone04/asr-diarization-pipeline
-
-## Speaker Identification
-You can now enroll known speakers by providing reference audio samples. The pipeline will match incoming speaker segments against stored embeddings and label them accordingly. Unknown speakers are dynamically tracked per session.
+pip install git+https://huggingface.co/Capstone04/Prayashi_RealTime
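After installation, the package is driven through `asr_diarization.inference` (see the diff below). A minimal usage sketch, assuming `inputs` is a path to a local audio file (`meeting.wav` here is a hypothetical example, not from the repo):

```python
# Minimal usage sketch, assuming the package above is installed.
# ASR_Diarization and the pipe(inputs) call mirror asr_diarization/inference.py below;
# the input format (a local audio file path) is an assumption.
import os
from asr_diarization.pipeline import ASR_Diarization

pipe = ASR_Diarization(os.environ.get("HF_TOKEN"))
result = pipe("meeting.wav")  # hypothetical input file
print(result)
```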
 
 
 
asr_diarization/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (236 Bytes).
 
asr_diarization/__pycache__/pipeline.cpython-312.pyc ADDED
Binary file (29.5 kB).
 
asr_diarization/inference.py CHANGED
@@ -1,6 +1,7 @@
 import os
 from .pipeline import ASR_Diarization
 
+
 import json
 import numpy as np
 
@@ -13,7 +14,7 @@ def load_known_embeddings(path="known_speakers.json"):
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 known_embeddings = load_known_embeddings()
-pipe = ASR_Diarization(HF_TOKEN, known_embeddings=known_embeddings)
+pipe = ASR_Diarization(HF_TOKEN)
 
 def inference(inputs):
     return pipe(inputs)
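The body of `load_known_embeddings` sits outside this hunk; only its signature is visible. A plausible sketch, assuming the enrollment file maps speaker names to embedding lists (the JSON layout is an assumption):

```python
import json
import os
import numpy as np

def load_known_embeddings(path="known_speakers.json"):
    # Assumed layout: {"alice": [0.12, -0.03, ...], "bob": [...]}
    # Returns an empty dict when no enrollment file exists yet.
    if not os.path.exists(path):
        return {}
    with open(path) as f:
        data = json.load(f)
    return {name: np.asarray(vec, dtype=np.float32) for name, vec in data.items()}
```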
asr_diarization/pipeline.py CHANGED
@@ -1,12 +1,6 @@
 import os
 import json
 import torch
-
-# Fix TF32 reproducibility warning and potential computation issues
-if torch.cuda.is_available():
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-
 import tempfile
 import torchaudio
 import threading
@@ -15,21 +9,18 @@ import soundfile as sf
 import noisereduce as nr
 from scipy import signal
 from numpy.linalg import norm
-from pyannote.audio import Pipeline
-from speechbrain.pretrained import EncoderClassifier
-from pyannote.core import Annotation, Segment
+from speechbrain.pretrained import SpeakerRecognition, EncoderClassifier
+from speechbrain.pretrained import SpectralMaskEnhancement
 from transformers import pipeline as hf_pipeline
-from pyannote.metrics.diarization import DiarizationErrorRate
 from jiwer import wer, Compose, ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip
 
 class ASR_Diarization:
-    def __init__(self, HF_TOKEN,
-                 diar_model="pyannote/speaker-diarization-3.1",
-                 asr_model="openai/whisper-medium"):
+    def __init__(self, HF_TOKEN, asr_model="openai/whisper-medium"):
         self.HF_TOKEN = HF_TOKEN
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self._unknown_lock = threading.Lock()
 
+        # Load SpeechBrain models
         try:
             self.embedding_model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
@@ -40,9 +31,17 @@ class ASR_Diarization:
             self.embedding_model = None
             print(f"[ERROR] Failed to load ECAPA: {e}")
 
-        self.diar_pipeline = Pipeline.from_pretrained(diar_model, use_auth_token=None)
-diar_model="pyannote/speaker-diarization-3.1"
+        try:
+            self.speaker_diarization = SpeakerRecognition.from_hparams(
+                source="speechbrain/spkrec-ecapa-voxceleb",
+                savedir="pretrained_models/spkrec-ecapa-voxceleb"
+            )
+            print("[Speaker Recognition] Model loaded successfully.")
+        except Exception as e:
+            self.speaker_diarization = None
+            print(f"[ERROR] Failed to load Speaker Recognition: {e}")
 
+        # Load ASR pipeline
         device_index = 0 if torch.cuda.is_available() else -1
         self.asr_pipeline = hf_pipeline(
             "automatic-speech-recognition",
@@ -52,11 +51,70 @@ diar_model="pyannote/speaker-diarization-3.1"
         )
 
     def run_diarization(self, audio_path):
-        diarization = self.diar_pipeline(audio_path)
-        return [
-            {"start": t.start, "end": t.end, "speaker": spk}
-            for t, _, spk in diarization.itertracks(yield_label=True)
-        ]
+        """Simple diarization using SpeechBrain embedding clustering"""
+        audio, sr = torchaudio.load(audio_path)
+        audio_np = audio[0].numpy() if audio.shape[0] == 1 else audio.mean(dim=0).numpy()
+
+        # Segment audio into chunks for diarization
+        chunk_duration = 2.0  # 2-second chunks
+        chunk_size = int(chunk_duration * sr)
+        segments = []
+
+        for i in range(0, len(audio_np), chunk_size):
+            start_time = i / sr
+            end_time = min((i + chunk_size) / sr, len(audio_np) / sr)
+            chunk = audio_np[i:i+chunk_size]
+
+            if len(chunk) < 8000:  # Skip very short chunks
+                continue
+
+            # Get speaker embedding for this chunk
+            if self.embedding_model:
+                try:
+                    chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).to(self.device)
+                    with torch.no_grad():
+                        embedding = self.embedding_model.encode_batch(chunk_tensor).squeeze().cpu().numpy()
+
+                    # Simple speaker assignment based on embedding similarity
+                    speaker_id = self._assign_speaker(embedding, segments)
+
+                    segments.append({
+                        "start": start_time,
+                        "end": end_time,
+                        "speaker": speaker_id,
+                        "embedding": embedding
+                    })
+                except Exception as e:
+                    print(f"Error processing chunk: {e}")
+                    continue
+
+        return segments
+
+    def _assign_speaker(self, embedding, existing_segments, threshold=0.7):
+        """Assign speaker based on embedding similarity"""
+        if not existing_segments:
+            return "speaker_1"
+
+        # Calculate similarity with existing speakers
+        similarities = []
+        for seg in existing_segments[-10:]:  # Check last 10 segments
+            if "embedding" in seg:
+                sim = np.dot(embedding.flatten(), seg["embedding"].flatten()) / (
+                    norm(embedding.flatten()) * norm(seg["embedding"].flatten())
+                )
+                similarities.append((seg["speaker"], sim))
+
+        if similarities:
+            best_speaker, best_sim = max(similarities, key=lambda x: x[1])
+            if best_sim > threshold:
+                return best_speaker
+
+        # Create new speaker
+        existing_speakers = set(seg["speaker"] for seg in existing_segments)
+        speaker_num = 1
+        while f"speaker_{speaker_num}" in existing_speakers:
+            speaker_num += 1
+        return f"speaker_{speaker_num}"
 
     def load_unknown_speakers(self, unknown_speakers_path):
         if os.path.exists(unknown_speakers_path):
@@ -123,7 +181,6 @@ diar_model="pyannote/speaker-diarization-3.1"
 
         return best_name, best_score, is_enrolled
 
-
     def run_transcription(self, audio_path, diar_json, enrolled_speakers=None, unknown_speakers_path=None):
         unknown_speakers_path = unknown_speakers_path or os.path.join(os.path.dirname(audio_path), "unknown_speakers.json")
 
@@ -137,6 +194,8 @@ diar_model="pyannote/speaker-diarization-3.1"
         enrolled_speakers_np = {n: v/norm(v) for n,v in (enrolled_speakers or {}).items() if norm(v) > 0}
 
         target_sr = 16000
+
+        # Group segments by speaker for clustering
         clusters = {}
         for seg in diar_json:
             clusters.setdefault(seg["speaker"], []).append(seg)
@@ -163,7 +222,7 @@ diar_model="pyannote/speaker-diarization-3.1"
             cluster_emb = np.mean(np.stack(seg_embs), axis=0)
             cluster_embeddings[cluster_label] = cluster_emb / norm(cluster_emb)
 
-        speaker_map, speakers_updated = {}, False
+        speaker_map, speakers_updated = {}, {}
         threshold = 0.5
 
         # Thread-safe unknown speaker update
@@ -191,7 +250,7 @@ diar_model="pyannote/speaker-diarization-3.1"
         if speakers_updated:
            self.save_unknown_speakers(unknown_speakers, unknown_speakers_path)
 
-        # ASR transcription (same as before)
+        # ASR transcription
         for seg in diar_json:
             start, end, spk = seg["start"], seg["end"], seg["speaker"]
             start_sample, end_sample = int(start*sr), int(end*sr)
@@ -276,19 +335,6 @@ diar_model="pyannote/speaker-diarization-3.1"
         hyp_rttm = os.path.join(output_dir, f"{base_name}.rttm")
         hyp_json = os.path.join(output_dir, f"{base_name}_merged_transcription.json")
 
-        if ref_rttm:
-            def load_rttm(path):
-                ann = Annotation()
-                for line in open(path):
-                    if line.startswith("SPEAKER"):
-                        p = line.split()
-                        start, dur, spk = float(p[3]), float(p[4]), p[7]
-                        ann[Segment(start, start+dur)] = spk
-                return ann
-
-            der_score = DiarizationErrorRate()(load_rttm(ref_rttm), load_rttm(hyp_rttm))
-            results["DER"] = round(der_score * 100, 2)
-
         if ref_json:
             def load_words(path):
                 data = json.load(open(path))
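The replacement `run_diarization` is a greedy online clustering: each 2-second chunk is embedded with ECAPA and compared by cosine similarity against the last ten labeled chunks, reusing the best label above the 0.7 threshold or minting a new one. A self-contained sketch of that assignment rule on toy vectors (the function name and values here are illustrative, not from the repo):

```python
import numpy as np
from numpy.linalg import norm

def assign_speaker(embedding, history, threshold=0.7):
    """Greedy assignment: reuse the most similar recent label, else mint a new one."""
    best = None
    for label, past in history[-10:]:  # only the ten most recent chunks, as in the diff
        sim = float(np.dot(embedding, past) / (norm(embedding) * norm(past)))
        if best is None or sim > best[1]:
            best = (label, sim)
    if best is not None and best[1] > threshold:
        return best[0]
    used = {label for label, _ in history}
    n = 1
    while f"speaker_{n}" in used:
        n += 1
    return f"speaker_{n}"

# Toy demo with 2-D stand-ins for ECAPA embeddings
history = []
for vec in [np.array([1.0, 0.0]), np.array([0.98, 0.1]), np.array([0.0, 1.0])]:
    history.append((assign_speaker(vec, history), vec))
print([label for label, _ in history])  # ['speaker_1', 'speaker_1', 'speaker_2']
```

Note the trade-off this encodes: because only the ten most recent chunks are compared, a speaker who stays silent for longer than that window can be re-minted under a new label.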
requirements.txt CHANGED
@@ -1,10 +1,8 @@
-torch>=2.3.0
-torchaudio>=2.3.0
-pyannote.audio==3.1.1
-transformers>=4.41.0
-huggingface_hub>=0.24.0
+torch
+torchaudio
+speechbrain
+transformers
 noisereduce
 scikit-learn
 jiwer
 librosa
-speechbrain
 
setup.py CHANGED
@@ -7,7 +7,7 @@ setup(
     install_requires=[
         "torch",
         "torchaudio",
-        "pyannote.audio",
+        "speechbrain",
         "transformers",
         "noisereduce",
         "scikit-learn",