Commit 62f98bb
Parent(s): cb14a1d

Heavy & Accurate: Integrated SpeechBrain VAD + MMS-300M pipeline
Files changed:
- .gitignore +2 -0
- README.md +4 -3
- app/infer.py +84 -13
- requirements.txt +6 -4
- verify_model.py +11 -24
- verify_speechbrain.py +50 -0
.gitignore
CHANGED

@@ -25,3 +25,5 @@ temp_*
 test_audio.py
 verify_pipeline.py
 test_api.py
+test_vad.wav
+tmp_vad_model/
README.md
CHANGED

@@ -17,9 +17,9 @@ Built for the **AI-Generated Voice Detection Challenge** with specific support f
 
 ## 🚀 Features
 
-- **Multilingual Support**: Uses the **…
+- **Multilingual Support**: Uses the state-of-the-art **MMS-300M (Massively Multilingual Speech)** model (`nii-yamagishilab/mms-300m-anti-deepfake`) derived from **XLS-R**, supporting 100+ languages including Indic languages.
 - **Strict API Specification**: Compliant with challenge requirements (Base64 MP3 input, standardized JSON response).
-- **Hybrid Detection**: Combines Deep Learning embeddings with **Acoustic …
+- **Smart Hybrid Detection**: Combines Deep Learning embeddings with **Acoustic Heuristics** (Pitch, Flatness, Liveness) for "Conservative Consensus" detection.
 - **Explainability**: Provides human-readable explanations for every decision.
 - **Secure**: Protected via `x-api-key` header authentication.
 
@@ -28,7 +28,8 @@ Built for the **AI-Generated Voice Detection Challenge** with specific support f
 
 ## 🛠️ Tech Stack
 
 - **Framework**: FastAPI (Python)
-- **Model**: PyTorch + HuggingFace Transformers (`…
+- **Model**: PyTorch + HuggingFace Transformers (`nii-yamagishilab/mms-300m-anti-deepfake`)
+- **Toolkit**: **SpeechBrain** (Environment ready for advanced audio processing)
 - **Audio Processing**: `pydub` (ffmpeg) + `librosa`
 - **Deployment**: Uvicorn
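The "Smart Hybrid Detection" bullet compresses the decision logic; a minimal sketch of what a "Conservative Consensus" vote between the model and the acoustic heuristics could look like (the function name and all thresholds below are illustrative assumptions, not the values used in app/infer.py):

    def conservative_consensus(model_prob_ai, pitch_var, flatness, rms_var, num_pauses):
        """Sketch: flag audio as AI-generated only when the deep model and
        the acoustic heuristics agree. All thresholds are illustrative."""
        heuristic_votes = [
            pitch_var < 10.0,   # unnaturally stable pitch contour
            flatness > 0.3,     # overly flat (synthetic-sounding) spectrum
            rms_var < 0.2,      # too-even energy envelope
            num_pauses == 0,    # no natural breathing pauses
        ]
        model_says_ai = model_prob_ai > 0.5
        # Conservative: require the model AND a majority of heuristics to agree.
        return model_says_ai and sum(heuristic_votes) >= 2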
app/infer.py
CHANGED

@@ -1,10 +1,13 @@
-import torch
-import torch.nn as nn
 import os
-import …
+import torch
+import torchaudio
 import librosa
+import numpy as np
 import time
-…
+import shutil
+from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
+from speechbrain.inference.VAD import VAD
+import soundfile as sf
 from dotenv import load_dotenv
 
 load_dotenv()

@@ -30,6 +33,19 @@ class VoiceClassifier:
             print(f"Error loading model: {e}")
             self.model = None
 
+        # Load SpeechBrain VAD
+        try:
+            print("Loading SpeechBrain VAD...")
+            self.vad_model = VAD.from_hparams(
+                source="speechbrain/vad-crdnn-libriparty",
+                savedir="tmp_vad_model",
+                run_opts={"device": str(self.device)}
+            )
+            print("SpeechBrain VAD loaded.")
+        except Exception as e:
+            print(f"Error loading VAD: {e}")
+            self.vad_model = None
+
     def calculate_snr(self, audio_np):
         """
         Estimate Signal-to-Noise Ratio (SNR) in dB.

@@ -54,6 +70,48 @@
         except Exception:
             return 30.0  # Default to decent SNR if calculation fails
 
+    def apply_vad(self, wav_path):
+        """
+        Apply VAD to filter out silence/noise.
+        Returns cleaned waveform (numpy) or original if failed/empty.
+        """
+        if self.vad_model is None:
+            return None
+
+        try:
+            # Get speech segments
+            boundaries = self.vad_model.get_speech_segments(wav_path)
+
+            # If tensor, convert to list
+            if isinstance(boundaries, torch.Tensor):
+                boundaries = boundaries.cpu().numpy()
+
+            # Load original audio
+            wav, sr = librosa.load(wav_path, sr=16000)
+
+            if len(boundaries) == 0:
+                print("DEBUG: VAD found no speech. Using original.")
+                return wav
+
+            # Concatenate segments
+            cleaned_wavs = []
+            for start, end in boundaries:
+                start_sample = int(start * sr)
+                end_sample = int(end * sr)
+                if end_sample > len(wav): end_sample = len(wav)
+                cleaned_wavs.append(wav[start_sample:end_sample])
+
+            if not cleaned_wavs:
+                return wav
+
+            final_wav = np.concatenate(cleaned_wavs)
+            print(f"DEBUG: VAD reduced audio from {len(wav)/sr:.2f}s to {len(final_wav)/sr:.2f}s")
+            return final_wav
+
+        except Exception as e:
+            print(f"VAD Error: {e}")
+            return None
+
     def predict(self, waveform: torch.Tensor, language: str = "Unknown"):
         if self.model is None:
             return {"error": "Model not loaded"}

@@ -63,27 +121,39 @@
         wav_np = waveform.squeeze().cpu().numpy()
         sr = 16000
 
+        # Save to temp file for VAD (SpeechBrain prefers files)
+        tmp_file = "temp_vad_input.wav"
+        sf.write(tmp_file, wav_np, sr)
+
+        # --- STAGE 1: SPEECHBRAIN VAD ---
         t0 = time.time()
+        vad_wav = self.apply_vad(tmp_file)
 
+        # Use VAD audio if valid and not too short, else original
+        if vad_wav is not None and len(vad_wav) > sr * 0.5:
+            wav_for_analysis = vad_wav
+        else:
+            wav_for_analysis = wav_np
+
-        # …
+        # Signal Quality Checks (on original to capture noise floor)
         snr_db = self.calculate_snr(wav_np)
 
-        # --- ADVANCED FEATURE EXTRACTION ---
+        # --- ADVANCED FEATURE EXTRACTION (on VAD audio) ---
         # A. Pitch Analysis
         f0, voiced_flag, voiced_probs = librosa.pyin(
-            wav_np, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr
+            wav_for_analysis, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr
         )
         f0_clean = f0[~np.isnan(f0)]
         pitch_var = np.std(f0_clean) if len(f0_clean) > 0 else 0.0
 
         # B. Spectral Flatness
-        flatness = np.mean(librosa.feature.spectral_flatness(y=wav_np))
+        flatness = np.mean(librosa.feature.spectral_flatness(y=wav_for_analysis))
 
         # C. RMS Energy Variance
-        rms = librosa.feature.rms(y=wav_np)[0]
+        rms = librosa.feature.rms(y=wav_for_analysis)[0]
         rms_var = np.std(rms) / (np.mean(rms) + 1e-6)
 
-        # D. Liveness (Pause) Detection
+        # D. Liveness (Pause) Detection (Use original to detect gaps)
         # Count distinct silent intervals (>0.1s)
         silent_intervals = librosa.effects.split(wav_np, top_db=20, frame_length=2048, hop_length=512)
         num_pauses = 0

@@ -95,12 +165,13 @@
                 num_pauses += 1
 
         # --- TEMPORAL CONSISTENCY ---
+        # Use VAD audio for Deepfake Classification
        chunk_size = 2 * sr
         stride = 1 * sr
         chunks = []
-        for i in range(0, len(wav_np) - chunk_size + 1, stride):
-            chunks.append(wav_np[i : i + chunk_size])
-        if not chunks: chunks = [wav_np]
+        for i in range(0, len(wav_for_analysis) - chunk_size + 1, stride):
+            chunks.append(wav_for_analysis[i : i + chunk_size])
+        if not chunks: chunks = [wav_for_analysis]
 
         chunk_probs = []
         for chunk in chunks:
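With these changes, predict() runs: temp-file write → SpeechBrain VAD trim → SNR on the raw signal → pitch/flatness/RMS/pause heuristics → chunked MMS-300M classification. A minimal driver (a sketch: it assumes VoiceClassifier() needs no constructor arguments and is importable as app.infer, which this diff suggests but does not show):

    import torch
    import librosa
    from app.infer import VoiceClassifier  # module path assumed from this diff

    classifier = VoiceClassifier()

    # Load any input as 16 kHz mono, matching the sr = 16000 hard-coded in predict().
    wav, sr = librosa.load("sample.mp3", sr=16000, mono=True)
    waveform = torch.from_numpy(wav).unsqueeze(0)  # predict() squeezes this back out

    result = classifier.predict(waveform, language="Hindi")
    print(result)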
requirements.txt
CHANGED

@@ -1,13 +1,15 @@
-fastapi
 uvicorn
 python-dotenv
-torch
-torchaudio
+torch<2.1.0
+torchaudio<2.1.0
 librosa
-numpy
+numpy<2.0.0
 python-multipart
 python-jose[cryptography]
 passlib[bcrypt]
 transformers
 pydub
 scipy
+speechbrain
+huggingface_hub<0.20.0
+soundfile
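The new upper bounds (torch<2.1.0, numpy<2.0.0, huggingface_hub<0.20.0) pin the environment, presumably for compatibility with SpeechBrain's model-fetching code. A quick sanity check that the running environment actually satisfies them (a sketch; assumes plain X.Y.Z version strings):

    from importlib.metadata import version

    # Upper bounds copied from requirements.txt.
    pins = {"torch": "2.1.0", "numpy": "2.0.0", "huggingface_hub": "0.20.0"}

    for pkg, upper in pins.items():
        installed = version(pkg).split("+")[0]  # drop local tags like +cu118
        ok = tuple(map(int, installed.split(".")[:3])) < tuple(map(int, upper.split(".")))
        print(f"{pkg}: installed {installed}, required <{upper} -> {'OK' if ok else 'TOO NEW'}")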
verify_model.py
CHANGED

@@ -1,33 +1,20 @@
 
 import torch
-from transformers import …
+from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
 import numpy as np
 
-def …
-    model_name = "…
-
+def check_model():
+    model_name = "nii-yamagishilab/mms-300m-anti-deepfake"
+    feature_extractor_name = "facebook/mms-300m"
+
+    print(f"Verifying load for: {model_name}")
     try:
-        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(…
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(feature_extractor_name)
         model = AutoModelForAudioClassification.from_pretrained(model_name)
-        print("Model …
-
-        print("Labels:", model.config.id2label)
-
-        # Create dummy audio (1 second of silence/noise)
-        # 16000 Hz
-        dummy_audio = np.random.uniform(-1, 1, 16000)
-
-        inputs = feature_extractor(dummy_audio, sampling_rate=16000, return_tensors="pt")
-
-        with torch.no_grad():
-            logits = model(**inputs).logits
-
-        print("Logits:", logits)
-        predicted_class_id = torch.argmax(logits, dim=-1).item()
-        print("Predicted Label:", model.config.id2label[predicted_class_id])
-
+        print("Success! Model and Extractor loaded.")
+        print(f"Classes: {model.config.id2label}")
     except Exception as e:
-        print(f"Failed…
+        print(f"Failed: {e}")
 
 if __name__ == "__main__":
-    …
+    check_model()
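The rewritten script now only verifies that the checkpoint and feature extractor load. The dummy-audio forward pass this commit removed is still a useful smoke test; a sketch restoring it on top of the new model/extractor names:

    import torch
    import numpy as np
    from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification

    def smoke_test():
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/mms-300m")
        model = AutoModelForAudioClassification.from_pretrained("nii-yamagishilab/mms-300m-anti-deepfake")

        # One second of random noise at 16 kHz, as in the removed code.
        dummy_audio = np.random.uniform(-1, 1, 16000)
        inputs = feature_extractor(dummy_audio, sampling_rate=16000, return_tensors="pt")

        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_class_id = torch.argmax(logits, dim=-1).item()
        print("Predicted Label:", model.config.id2label[predicted_class_id])

    if __name__ == "__main__":
        smoke_test()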
verify_speechbrain.py
ADDED

@@ -0,0 +1,50 @@
+
+import torch
+import torchaudio
+import numpy as np
+from speechbrain.inference.VAD import VAD
+
+def verify_vad():
+    model_source = "speechbrain/vad-crdnn-libriparty"
+    print(f"Loading VAD model: {model_source}...")
+
+    try:
+        # Load VAD
+        vad_model = VAD.from_hparams(
+            source=model_source,
+            savedir="tmp_vad_model",
+            run_opts={"device": "cpu"}  # Force CPU for verification
+        )
+        print("VAD Model loaded successfully!")
+
+        # Create dummy audio (random noise + silence + random noise)
+        sr = 16000
+        duration = 5  # seconds
+        t = np.linspace(0, duration, int(sr * duration))
+
+        # 1 sec noise, 2 sec silence, 2 sec noise
+        audio = np.random.uniform(-0.1, 0.1, int(sr * 1))
+        audio = np.concatenate([audio, np.zeros(int(sr * 2))])
+        audio = np.concatenate([audio, np.random.uniform(-0.1, 0.1, int(sr * 2))])
+
+        # Convert to tensor path not needed if we can process tensor
+        # SpeechBrain VAD usually expects a file path, but let's check input flexibility
+        # For this test, save to a temp file
+        import soundfile as sf
+        sf.write('test_vad.wav', audio, sr)
+
+        print("Processing test_vad.wav...")
+        # Boundaries usually returns a tensor of [start, end]
+        boundaries = vad_model.get_speech_segments("test_vad.wav")
+        print(f"Speech Segments found: \n{boundaries}")
+
+        # Check if it filtered the silence
+        print("Verification complete.")
+
+    except Exception as e:
+        print(f"Error: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    verify_vad()
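For reference, get_speech_segments() returns (start, end) pairs in seconds; mapping them back onto samples to build a speech-only waveform looks like the sketch below, mirroring apply_vad() in app/infer.py:

    import numpy as np
    import librosa

    def trim_to_speech(wav_path, boundaries, sr=16000):
        """Keep only the spans the VAD marked as speech (a sketch)."""
        wav, sr = librosa.load(wav_path, sr=sr)
        kept = [wav[int(s * sr):min(int(e * sr), len(wav))]
                for s, e in np.asarray(boundaries)]
        return np.concatenate(kept) if kept else wav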