Upload folder using huggingface_hub
Files changed:

- README.md (+2 -5)
- asr_diarization/__pycache__/__init__.cpython-312.pyc (+0 -0)
- asr_diarization/__pycache__/pipeline.cpython-312.pyc (+0 -0)
- asr_diarization/inference.py (+2 -1)
- asr_diarization/pipeline.py (+82 -36)
- requirements.txt (+4 -6)
- setup.py (+1 -1)
README.md CHANGED

````diff
@@ -9,11 +9,8 @@ pipeline_tag: automatic-speech-recognition
 
 This package provides an **Automatic Speech Recognition (ASR) + Speaker Diarization** pipeline using:
 - [OpenAI Whisper](https://huggingface.co/openai/whisper-medium)
-- [
+- [SpeechBrain ECAPA-TDNN](https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb)
 
 ## Install
 ```bash
-pip install git+https://huggingface.co/Capstone04/
-
-## Speaker Identification
-You can now enroll known speakers by providing reference audio samples. The pipeline will match incoming speaker segments against stored embeddings and label them accordingly. Unknown speakers are dynamically tracked per session.
+pip install git+https://huggingface.co/Capstone04/Prayashi_RealTime
````
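The removed "Speaker Identification" paragraph described enrolling known speakers from reference audio and matching diarized segments against stored embeddings. For context, here is a minimal enrollment sketch that would produce the `known_speakers.json` file read by `asr_diarization/inference.py`. The reference file names, the name-to-vector JSON layout, and the 16 kHz mono input are illustrative assumptions, not a documented API of this repo:

```python
import json
import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier

# Load the same ECAPA-TDNN encoder the pipeline uses for speaker embeddings.
encoder = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
)

# Hypothetical reference clips: one clean 16 kHz mono sample per speaker.
references = {"alice": "alice_ref.wav", "bob": "bob_ref.wav"}

known = {}
for name, path in references.items():
    waveform, sr = torchaudio.load(path)  # shape: [channels, time]
    with torch.no_grad():
        emb = encoder.encode_batch(waveform).squeeze().cpu().numpy()
    known[name] = emb.tolist()  # store as a JSON-serializable list

# Assumed layout: {"speaker_name": [embedding values...]}.
with open("known_speakers.json", "w") as f:
    json.dump(known, f)
```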
asr_diarization/__pycache__/__init__.cpython-312.pyc ADDED

Binary file (236 Bytes).

asr_diarization/__pycache__/pipeline.cpython-312.pyc ADDED

Binary file (29.5 kB).
asr_diarization/inference.py CHANGED

```diff
@@ -1,6 +1,7 @@
 import os
 from .pipeline import ASR_Diarization
 
+
 import json
 import numpy as np
 
@@ -13,7 +14,7 @@ def load_known_embeddings(path="known_speakers.json"):
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 known_embeddings = load_known_embeddings()
-pipe = ASR_Diarization(HF_TOKEN
+pipe = ASR_Diarization(HF_TOKEN)
 
 def inference(inputs):
     return pipe(inputs)
```
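A hedged usage sketch of this entry point, assuming `inference()` accepts a path to an audio file (the expected `inputs` format is not spelled out in this diff); the token value and `example.wav` are placeholders:

```python
import os

# HF_TOKEN is read at module import time, so set it before importing.
os.environ["HF_TOKEN"] = "hf_..."  # placeholder token

from asr_diarization.inference import inference

# Assumed input: a path to a mono WAV file.
result = inference("example.wav")
print(result)
```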
asr_diarization/pipeline.py CHANGED

```diff
@@ -1,12 +1,6 @@
 import os
 import json
 import torch
-
-# Fix TF32 reproducibility warning and potential computation issues
-if torch.cuda.is_available():
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-
 import tempfile
 import torchaudio
 import threading
@@ -15,21 +9,18 @@ import soundfile as sf
 import noisereduce as nr
 from scipy import signal
 from numpy.linalg import norm
-from
-from speechbrain.pretrained import
-from pyannote.core import Annotation, Segment
+from speechbrain.pretrained import SpeakerRecognition, EncoderClassifier
+from speechbrain.pretrained import SpectralMaskEnhancement
 from transformers import pipeline as hf_pipeline
-from pyannote.metrics.diarization import DiarizationErrorRate
 from jiwer import wer, Compose, ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip
 
 class ASR_Diarization:
-    def __init__(self, HF_TOKEN,
-                 diar_model="pyannote/speaker-diarization-3.1",
-                 asr_model="openai/whisper-medium"):
+    def __init__(self, HF_TOKEN, asr_model="openai/whisper-medium"):
         self.HF_TOKEN = HF_TOKEN
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self._unknown_lock = threading.Lock()
 
+        # Load SpeechBrain models
         try:
             self.embedding_model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
@@ -40,9 +31,17 @@ class ASR_Diarization:
             self.embedding_model = None
             print(f"[ERROR] Failed to load ECAPA: {e}")
 
-
-
+        try:
+            self.speaker_diarization = SpeakerRecognition.from_hparams(
+                source="speechbrain/spkrec-ecapa-voxceleb",
+                savedir="pretrained_models/spkrec-ecapa-voxceleb"
+            )
+            print("[Speaker Recognition] Model loaded successfully.")
+        except Exception as e:
+            self.speaker_diarization = None
+            print(f"[ERROR] Failed to load Speaker Recognition: {e}")
 
+        # Load ASR pipeline
         device_index = 0 if torch.cuda.is_available() else -1
         self.asr_pipeline = hf_pipeline(
             "automatic-speech-recognition",
@@ -52,11 +51,70 @@ diar_model="pyannote/speaker-diarization-3.1"
         )
 
     def run_diarization(self, audio_path):
-        diarization
-
-
-
-
+        """Simple diarization using SpeechBrain embedding clustering"""
+        audio, sr = torchaudio.load(audio_path)
+        audio_np = audio[0].numpy() if audio.shape[0] == 1 else audio.mean(dim=0).numpy()
+
+        # Segment audio into chunks for diarization
+        chunk_duration = 2.0  # 2-second chunks
+        chunk_size = int(chunk_duration * sr)
+        segments = []
+
+        for i in range(0, len(audio_np), chunk_size):
+            start_time = i / sr
+            end_time = min((i + chunk_size) / sr, len(audio_np) / sr)
+            chunk = audio_np[i:i+chunk_size]
+
+            if len(chunk) < 8000:  # Skip very short chunks
+                continue
+
+            # Get speaker embedding for this chunk
+            if self.embedding_model:
+                try:
+                    chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).to(self.device)
+                    with torch.no_grad():
+                        embedding = self.embedding_model.encode_batch(chunk_tensor).squeeze().cpu().numpy()
+
+                    # Simple speaker assignment based on embedding similarity
+                    speaker_id = self._assign_speaker(embedding, segments)
+
+                    segments.append({
+                        "start": start_time,
+                        "end": end_time,
+                        "speaker": speaker_id,
+                        "embedding": embedding
+                    })
+                except Exception as e:
+                    print(f"Error processing chunk: {e}")
+                    continue
+
+        return segments
+
+    def _assign_speaker(self, embedding, existing_segments, threshold=0.7):
+        """Assign speaker based on embedding similarity"""
+        if not existing_segments:
+            return "speaker_1"
+
+        # Calculate similarity with existing speakers
+        similarities = []
+        for seg in existing_segments[-10:]:  # Check last 10 segments
+            if "embedding" in seg:
+                sim = np.dot(embedding.flatten(), seg["embedding"].flatten()) / (
+                    norm(embedding.flatten()) * norm(seg["embedding"].flatten())
+                )
+                similarities.append((seg["speaker"], sim))
+
+        if similarities:
+            best_speaker, best_sim = max(similarities, key=lambda x: x[1])
+            if best_sim > threshold:
+                return best_speaker
+
+        # Create new speaker
+        existing_speakers = set(seg["speaker"] for seg in existing_segments)
+        speaker_num = 1
+        while f"speaker_{speaker_num}" in existing_speakers:
+            speaker_num += 1
+        return f"speaker_{speaker_num}"
 
     def load_unknown_speakers(self, unknown_speakers_path):
         if os.path.exists(unknown_speakers_path):
```
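A minimal sketch of how the new chunk-based diarizer might be driven, assuming a 16 kHz mono WAV as input; `meeting.wav` is a placeholder:

```python
import os
from asr_diarization.pipeline import ASR_Diarization

# HF_TOKEN is kept for the ASR side; the SpeechBrain encoder itself is public.
pipe = ASR_Diarization(os.environ.get("HF_TOKEN"))

segments = pipe.run_diarization("meeting.wav")  # placeholder file
for seg in segments:
    print(f'{seg["start"]:6.1f}s  {seg["end"]:6.1f}s  {seg["speaker"]}')
```

Note the design trade-off visible in the hunk above: `_assign_speaker` does greedy online assignment, comparing each 2-second chunk's embedding by cosine similarity (threshold 0.7) against only the last 10 segments, which is cheap and streaming-friendly but can fragment a speaker who falls outside that window, in contrast to the global clustering done by the `pyannote/speaker-diarization-3.1` model it replaces. The remaining hunks touch `run_transcription` and the evaluation path:

```diff
@@ -123,7 +181,6 @@ diar_model="pyannote/speaker-diarization-3.1"
 
         return best_name, best_score, is_enrolled
 
-
     def run_transcription(self, audio_path, diar_json, enrolled_speakers=None, unknown_speakers_path=None):
         unknown_speakers_path = unknown_speakers_path or os.path.join(os.path.dirname(audio_path), "unknown_speakers.json")
 
```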
```diff
@@ -137,6 +194,8 @@ diar_model="pyannote/speaker-diarization-3.1"
         enrolled_speakers_np = {n: v/norm(v) for n,v in (enrolled_speakers or {}).items() if norm(v) > 0}
 
         target_sr = 16000
+
+        # Group segments by speaker for clustering
         clusters = {}
         for seg in diar_json:
             clusters.setdefault(seg["speaker"], []).append(seg)
@@ -163,7 +222,7 @@ diar_model="pyannote/speaker-diarization-3.1"
             cluster_emb = np.mean(np.stack(seg_embs), axis=0)
             cluster_embeddings[cluster_label] = cluster_emb / norm(cluster_emb)
 
-        speaker_map, speakers_updated = {},
+        speaker_map, speakers_updated = {}, {}
         threshold = 0.5
 
         # Thread-safe unknown speaker update
@@ -191,7 +250,7 @@ diar_model="pyannote/speaker-diarization-3.1"
         if speakers_updated:
             self.save_unknown_speakers(unknown_speakers, unknown_speakers_path)
 
-        # ASR transcription
+        # ASR transcription
         for seg in diar_json:
             start, end, spk = seg["start"], seg["end"], seg["speaker"]
             start_sample, end_sample = int(start*sr), int(end*sr)
@@ -276,19 +335,6 @@ diar_model="pyannote/speaker-diarization-3.1"
         hyp_rttm = os.path.join(output_dir, f"{base_name}.rttm")
         hyp_json = os.path.join(output_dir, f"{base_name}_merged_transcription.json")
 
-        if ref_rttm:
-            def load_rttm(path):
-                ann = Annotation()
-                for line in open(path):
-                    if line.startswith("SPEAKER"):
-                        p = line.split()
-                        start, dur, spk = float(p[3]), float(p[4]), p[7]
-                        ann[Segment(start, start+dur)] = spk
-                return ann
-
-            der_score = DiarizationErrorRate()(load_rttm(ref_rttm), load_rttm(hyp_rttm))
-            results["DER"] = round(der_score * 100, 2)
-
         if ref_json:
             def load_words(path):
                 data = json.load(open(path))
```
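The DER computation is removed along with the pyannote imports, but the WER path under `if ref_json:` survives, built on the jiwer transforms imported at the top of the file. A small sketch of that normalization idea, with made-up reference and hypothesis strings (whether `run_transcription` applies the transforms in exactly this way is not visible in these hunks):

```python
from jiwer import wer, Compose, ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip

# Normalize both sides identically so casing and punctuation don't count as errors.
normalize = Compose([ToLowerCase(), RemovePunctuation(), RemoveMultipleSpaces(), Strip()])

reference = "Hello, world! This is a test."   # made-up ground truth
hypothesis = "hello world this is the test"   # made-up ASR output

score = wer(normalize(reference), normalize(hypothesis))
print(f"WER: {score:.2%}")  # one substitution over six words, about 16.67%
```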
requirements.txt CHANGED

```diff
@@ -1,10 +1,8 @@
-torch
-torchaudio
-
-transformers
-huggingface_hub>=0.24.0
+torch
+torchaudio
+speechbrain
+transformers
 noisereduce
 scikit-learn
 jiwer
 librosa
-speechbrain
```
setup.py CHANGED

```diff
@@ -7,7 +7,7 @@ setup(
     install_requires=[
         "torch",
         "torchaudio",
-        "
+        "speechbrain",
         "transformers",
         "noisereduce",
         "scikit-learn",
```