S-Vetrivel committed on
Commit
62f98bb
·
1 Parent(s): cb14a1d

Heavy & Accurate: Integrated SpeechBrain VAD + MMS-300M pipeline

Browse files
Files changed (6) hide show
  1. .gitignore +2 -0
  2. README.md +4 -3
  3. app/infer.py +84 -13
  4. requirements.txt +6 -4
  5. verify_model.py +11 -24
  6. verify_speechbrain.py +50 -0
.gitignore CHANGED
@@ -25,3 +25,5 @@ temp_*
25
  test_audio.py
26
  verify_pipeline.py
27
  test_api.py
 
 
 
25
  test_audio.py
26
  verify_pipeline.py
27
  test_api.py
28
+ test_vad.wav
29
+ tmp_vad_model/
README.md CHANGED
@@ -17,9 +17,9 @@ Built for the **AI-Generated Voice Detection Challenge** with specific support f
17
 
18
  ## 🚀 Features
19
 
20
- - **Multilingual Support**: Uses the **XLS-R (Cross-Lingual Speech Representation)** model (`wav2vec2-large-xlsr-53`) pre-trained on 53 languages.
21
  - **Strict API Specification**: Compliant with challenge requirements (Base64 MP3 input, standardized JSON response).
22
- - **Hybrid Detection**: Combines Deep Learning embeddings with **Acoustic Feature Analysis** (Pitch Variance) for robust detection.
23
  - **Explainability**: Provides human-readable explanations for every decision.
24
  - **Secure**: Protected via `x-api-key` header authentication.
25
 
@@ -28,7 +28,8 @@ Built for the **AI-Generated Voice Detection Challenge** with specific support f
28
  ## 🛠️ Tech Stack
29
 
30
  - **Framework**: FastAPI (Python)
31
- - **Model**: PyTorch + HuggingFace Transformers (`facebook/wav2vec2-large-xlsr-53`)
 
32
  - **Audio Processing**: `pydub` (ffmpeg) + `librosa`
33
  - **Deployment**: Uvicorn
34
 
 
17
 
18
  ## 🚀 Features
19
 
20
+ - **Multilingual Support**: Uses the state-of-the-art **MMS-300M (Massively Multilingual Speech)** model (`nii-yamagishilab/mms-300m-anti-deepfake`) derived from **XLS-R**, supporting 100+ languages including Indic languages.
21
  - **Strict API Specification**: Compliant with challenge requirements (Base64 MP3 input, standardized JSON response).
22
+ - **Smart Hybrid Detection**: Combines Deep Learning embeddings with **Acoustic Heuristics** (Pitch, Flatness, Liveness) for "Conservative Consensus" detection.
23
  - **Explainability**: Provides human-readable explanations for every decision.
24
  - **Secure**: Protected via `x-api-key` header authentication.
25
 
 
28
  ## 🛠️ Tech Stack
29
 
30
  - **Framework**: FastAPI (Python)
31
+ - **Model**: PyTorch + HuggingFace Transformers (`nii-yamagishilab/mms-300m-anti-deepfake`)
32
+ - **Toolkit**: **SpeechBrain** (Environment ready for advanced audio processing)
33
  - **Audio Processing**: `pydub` (ffmpeg) + `librosa`
34
  - **Deployment**: Uvicorn
35
 
app/infer.py CHANGED
@@ -1,10 +1,13 @@
1
- import torch
2
- import torch.nn as nn
3
  import os
4
- import numpy as np
 
5
  import librosa
 
6
  import time
7
- from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
 
 
 
8
  from dotenv import load_dotenv
9
 
10
  load_dotenv()
@@ -30,6 +33,19 @@ class VoiceClassifier:
30
  print(f"Error loading model: {e}")
31
  self.model = None
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def calculate_snr(self, audio_np):
34
  """
35
  Estimate Signal-to-Noise Ratio (SNR) in dB.
@@ -54,6 +70,48 @@ class VoiceClassifier:
54
  except Exception:
55
  return 30.0 # Default to decent SNR if calculation fails
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def predict(self, waveform: torch.Tensor, language: str = "Unknown"):
58
  if self.model is None:
59
  return {"error": "Model not loaded"}
@@ -63,27 +121,39 @@ class VoiceClassifier:
63
  wav_np = waveform.squeeze().cpu().numpy()
64
  sr = 16000
65
 
 
 
 
 
 
66
  t0 = time.time()
 
67
 
68
- # --- SIGNAL QUALITY CHECKS ---
 
 
 
 
 
 
69
  snr_db = self.calculate_snr(wav_np)
70
 
71
- # --- ADVANCED FEATURE EXTRACTION ---
72
  # A. Pitch Analysis
73
  f0, voiced_flag, voiced_probs = librosa.pyin(
74
- wav_np, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr
75
  )
76
  f0_clean = f0[~np.isnan(f0)]
77
  pitch_var = np.std(f0_clean) if len(f0_clean) > 0 else 0.0
78
 
79
  # B. Spectral Flatness
80
- flatness = np.mean(librosa.feature.spectral_flatness(y=wav_np))
81
 
82
  # C. RMS Energy Variance
83
- rms = librosa.feature.rms(y=wav_np)[0]
84
  rms_var = np.std(rms) / (np.mean(rms) + 1e-6)
85
 
86
- # D. Liveness (Pause) Detection
87
  # Count distinct silent intervals (>0.1s)
88
  silent_intervals = librosa.effects.split(wav_np, top_db=20, frame_length=2048, hop_length=512)
89
  num_pauses = 0
@@ -95,12 +165,13 @@ class VoiceClassifier:
95
  num_pauses += 1
96
 
97
  # --- TEMPORAL CONSISTENCY ---
 
98
  chunk_size = 2 * sr
99
  stride = 1 * sr
100
  chunks = []
101
- for i in range(0, len(wav_np) - chunk_size + 1, stride):
102
- chunks.append(wav_np[i : i + chunk_size])
103
- if not chunks: chunks = [wav_np]
104
 
105
  chunk_probs = []
106
  for chunk in chunks:
 
 
 
1
  import os
2
+ import torch
3
+ import torchaudio
4
  import librosa
5
+ import numpy as np
6
  import time
7
+ import shutil
8
+ from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
9
+ from speechbrain.inference.VAD import VAD
10
+ import soundfile as sf
11
  from dotenv import load_dotenv
12
 
13
  load_dotenv()
 
33
  print(f"Error loading model: {e}")
34
  self.model = None
35
 
36
+ # Load SpeechBrain VAD
37
+ try:
38
+ print("Loading SpeechBrain VAD...")
39
+ self.vad_model = VAD.from_hparams(
40
+ source="speechbrain/vad-crdnn-libriparty",
41
+ savedir="tmp_vad_model",
42
+ run_opts={"device": str(self.device)}
43
+ )
44
+ print("SpeechBrain VAD loaded.")
45
+ except Exception as e:
46
+ print(f"Error loading VAD: {e}")
47
+ self.vad_model = None
48
+
49
  def calculate_snr(self, audio_np):
50
  """
51
  Estimate Signal-to-Noise Ratio (SNR) in dB.
 
70
  except Exception:
71
  return 30.0 # Default to decent SNR if calculation fails
72
 
73
+ def apply_vad(self, wav_path):
74
+ """
75
+ Apply VAD to filter out silence/noise.
76
+ Returns cleaned waveform (numpy) or original if failed/empty.
77
+ """
78
+ if self.vad_model is None:
79
+ return None
80
+
81
+ try:
82
+ # Get speech segments
83
+ boundaries = self.vad_model.get_speech_segments(wav_path)
84
+
85
+ # If tensor, convert to list
86
+ if isinstance(boundaries, torch.Tensor):
87
+ boundaries = boundaries.cpu().numpy()
88
+
89
+ # Load original audio
90
+ wav, sr = librosa.load(wav_path, sr=16000)
91
+
92
+ if len(boundaries) == 0:
93
+ print("DEBUG: VAD found no speech. Using original.")
94
+ return wav
95
+
96
+ # Concatenate segments
97
+ cleaned_wavs = []
98
+ for start, end in boundaries:
99
+ start_sample = int(start * sr)
100
+ end_sample = int(end * sr)
101
+ if end_sample > len(wav): end_sample = len(wav)
102
+ cleaned_wavs.append(wav[start_sample:end_sample])
103
+
104
+ if not cleaned_wavs:
105
+ return wav
106
+
107
+ final_wav = np.concatenate(cleaned_wavs)
108
+ print(f"DEBUG: VAD reduced audio from {len(wav)/sr:.2f}s to {len(final_wav)/sr:.2f}s")
109
+ return final_wav
110
+
111
+ except Exception as e:
112
+ print(f"VAD Error: {e}")
113
+ return None
114
+
115
  def predict(self, waveform: torch.Tensor, language: str = "Unknown"):
116
  if self.model is None:
117
  return {"error": "Model not loaded"}
 
121
  wav_np = waveform.squeeze().cpu().numpy()
122
  sr = 16000
123
 
124
+ # Save to temp file for VAD (SpeechBrain prefers files)
125
+ tmp_file = "temp_vad_input.wav"
126
+ sf.write(tmp_file, wav_np, sr)
127
+
128
+ # --- STAGE 1: SPEECHBRAIN VAD ---
129
  t0 = time.time()
130
+ vad_wav = self.apply_vad(tmp_file)
131
 
132
+ # Use VAD audio if valid and not too short, else original
133
+ if vad_wav is not None and len(vad_wav) > sr * 0.5:
134
+ wav_for_analysis = vad_wav
135
+ else:
136
+ wav_for_analysis = wav_np
137
+
138
+ # Signal Quality Checks (on original to capture noise floor)
139
  snr_db = self.calculate_snr(wav_np)
140
 
141
+ # --- ADVANCED FEATURE EXTRACTION (on VAD audio) ---
142
  # A. Pitch Analysis
143
  f0, voiced_flag, voiced_probs = librosa.pyin(
144
+ wav_for_analysis, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr
145
  )
146
  f0_clean = f0[~np.isnan(f0)]
147
  pitch_var = np.std(f0_clean) if len(f0_clean) > 0 else 0.0
148
 
149
  # B. Spectral Flatness
150
+ flatness = np.mean(librosa.feature.spectral_flatness(y=wav_for_analysis))
151
 
152
  # C. RMS Energy Variance
153
+ rms = librosa.feature.rms(y=wav_for_analysis)[0]
154
  rms_var = np.std(rms) / (np.mean(rms) + 1e-6)
155
 
156
+ # D. Liveness (Pause) Detection (Use original to detect gaps)
157
  # Count distinct silent intervals (>0.1s)
158
  silent_intervals = librosa.effects.split(wav_np, top_db=20, frame_length=2048, hop_length=512)
159
  num_pauses = 0
 
165
  num_pauses += 1
166
 
167
  # --- TEMPORAL CONSISTENCY ---
168
+ # Use VAD audio for Deepfake Classification
169
  chunk_size = 2 * sr
170
  stride = 1 * sr
171
  chunks = []
172
+ for i in range(0, len(wav_for_analysis) - chunk_size + 1, stride):
173
+ chunks.append(wav_for_analysis[i : i + chunk_size])
174
+ if not chunks: chunks = [wav_for_analysis]
175
 
176
  chunk_probs = []
177
  for chunk in chunks:
requirements.txt CHANGED
@@ -1,13 +1,15 @@
1
- fastapi
2
  uvicorn
3
  python-dotenv
4
- torch
5
- torchaudio
6
  librosa
7
- numpy
8
  python-multipart
9
  python-jose[cryptography]
10
  passlib[bcrypt]
11
  transformers
12
  pydub
13
  scipy
 
 
 
 
 
1
  uvicorn
2
  python-dotenv
3
+ torch<2.1.0
4
+ torchaudio<2.1.0
5
  librosa
6
+ numpy<2.0.0
7
  python-multipart
8
  python-jose[cryptography]
9
  passlib[bcrypt]
10
  transformers
11
  pydub
12
  scipy
13
+ speechbrain
14
+ huggingface_hub<0.20.0
15
+ soundfile
verify_model.py CHANGED
@@ -1,33 +1,20 @@
1
 
2
  import torch
3
- from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
4
  import numpy as np
5
 
6
- def verify_model():
7
- model_name = "mo-thecreator/Deepfake-audio-detection"
8
- print(f"Loading {model_name}...")
 
 
9
  try:
10
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
11
  model = AutoModelForAudioClassification.from_pretrained(model_name)
12
- print("Model loaded successfully!")
13
-
14
- print("Labels:", model.config.id2label)
15
-
16
- # Create dummy audio (1 second of silence/noise)
17
- # 16000 Hz
18
- dummy_audio = np.random.uniform(-1, 1, 16000)
19
-
20
- inputs = feature_extractor(dummy_audio, sampling_rate=16000, return_tensors="pt")
21
-
22
- with torch.no_grad():
23
- logits = model(**inputs).logits
24
-
25
- print("Logits:", logits)
26
- predicted_class_id = torch.argmax(logits, dim=-1).item()
27
- print("Predicted Label:", model.config.id2label[predicted_class_id])
28
-
29
  except Exception as e:
30
- print(f"Failed to load/run model: {e}")
31
 
32
  if __name__ == "__main__":
33
- verify_model()
 
1
 
2
  import torch
3
+ from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
4
  import numpy as np
5
 
6
+ def check_model():
7
+ model_name = "nii-yamagishilab/mms-300m-anti-deepfake"
8
+ feature_extractor_name = "facebook/mms-300m"
9
+
10
+ print(f"Verifying load for: {model_name}")
11
  try:
12
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(feature_extractor_name)
13
  model = AutoModelForAudioClassification.from_pretrained(model_name)
14
+ print("Success! Model and Extractor loaded.")
15
+ print(f"Classes: {model.config.id2label}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  except Exception as e:
17
+ print(f"Failed: {e}")
18
 
19
  if __name__ == "__main__":
20
+ check_model()
verify_speechbrain.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import torchaudio
4
+ import numpy as np
5
+ from speechbrain.inference.VAD import VAD
6
+
7
+ def verify_vad():
8
+ model_source = "speechbrain/vad-crdnn-libriparty"
9
+ print(f"Loading VAD model: {model_source}...")
10
+
11
+ try:
12
+ # Load VAD
13
+ vad_model = VAD.from_hparams(
14
+ source=model_source,
15
+ savedir="tmp_vad_model",
16
+ run_opts={"device": "cpu"} # Force CPU for verification
17
+ )
18
+ print("VAD Model loaded successfully!")
19
+
20
+ # Create dummy audio (random noise + silence + random noise)
21
+ sr = 16000
22
+ duration = 5 # seconds
23
+ t = np.linspace(0, duration, int(sr * duration))
24
+
25
+ # 1 sec noise, 2 sec silence, 2 sec noise
26
+ audio = np.random.uniform(-0.1, 0.1, int(sr * 1))
27
+ audio = np.concatenate([audio, np.zeros(int(sr * 2))])
28
+ audio = np.concatenate([audio, np.random.uniform(-0.1, 0.1, int(sr * 2))])
29
+
30
+ # Convert to tensor path not needed if we can process tensor
31
+ # SpeechBrain VAD usually expects a file path, but let's check input flexibility
32
+ # For this test, save to a temp file
33
+ import soundfile as sf
34
+ sf.write('test_vad.wav', audio, sr)
35
+
36
+ print("Processing test_vad.wav...")
37
+ # Boundaries usually returns a tensor of [start, end]
38
+ boundaries = vad_model.get_speech_segments("test_vad.wav")
39
+ print(f"Speech Segments found: \n{boundaries}")
40
+
41
+ # Check if it filtered the silence
42
+ print("Verification complete.")
43
+
44
+ except Exception as e:
45
+ print(f"Error: {e}")
46
+ import traceback
47
+ traceback.print_exc()
48
+
49
+ if __name__ == "__main__":
50
+ verify_vad()