Spaces:
Sleeping
Sleeping
Hariharan S committed on
Commit ·
488006a
1
Parent(s): 0cd5695
Upgrade to SOTA Wav2Vec2 deepfake detector
Browse files- app/main.py +13 -0
- ml/inference.py +36 -13
- ml/sota_model.py +86 -0
- requirements.txt +3 -1
app/main.py
CHANGED
|
@@ -26,6 +26,19 @@ app = FastAPI(
|
|
| 26 |
version="1.0.0"
|
| 27 |
)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
# CORS configuration
|
| 30 |
app.add_middleware(
|
| 31 |
CORSMiddleware,
|
|
|
|
| 26 |
version="1.0.0"
|
| 27 |
)
|
| 28 |
|
| 29 |
+
# Startup Event to Preload Model
@app.on_event("startup")
async def startup_event():
    """Preload SOTA model on startup to avoid first-request latency.

    NOTE(review): ``@app.on_event("startup")`` is deprecated in recent
    FastAPI releases in favour of lifespan handlers; it still works, but
    consider migrating when the FastAPI pin is bumped.
    """
    try:
        logger.info("Initializing SOTA Deepfake Detector...")
        # Import inside function to avoid top-level overhead if imports fail
        from ml.sota_model import get_detector
        get_detector()  # Triggers model loading
        logger.info("SOTA Model preloaded successfully!")
    except Exception as e:
        # Best-effort preload: on failure the app still starts and the model
        # is loaded lazily on the first prediction request instead.
        logger.warning(f"Could not preload SOTA model: {e}")
|
| 41 |
+
|
| 42 |
# CORS configuration
|
| 43 |
app.add_middleware(
|
| 44 |
CORSMiddleware,
|
ml/inference.py
CHANGED
|
@@ -149,9 +149,18 @@ def heuristic_fallback(features):
|
|
| 149 |
# Clamp to valid range
|
| 150 |
return max(0.01, min(0.99, ai_score))
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
async def predict_voice_authenticity(audio_base64: str, language: str) -> Dict:
|
| 153 |
"""
|
| 154 |
-
Main inference pipeline
|
| 155 |
"""
|
| 156 |
temp_path = f"/tmp/{uuid.uuid4()}.mp3"
|
| 157 |
|
|
@@ -165,23 +174,37 @@ async def predict_voice_authenticity(audio_base64: str, language: str) -> Dict:
|
|
| 165 |
logger.error(f"Base64 decode failed: {e}")
|
| 166 |
raise ValueError("Invalid Base64 audio string")
|
| 167 |
|
| 168 |
-
# 2. Extract features
|
| 169 |
features = extract_audio_features(temp_path)
|
| 170 |
|
| 171 |
-
# 3.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
if os.path.exists(temp_path):
|
| 173 |
os.remove(temp_path)
|
| 174 |
-
|
| 175 |
-
# 4. Load model
|
| 176 |
-
classifier = load_model()
|
| 177 |
-
|
| 178 |
-
# 5. Run inference - Use heuristics for better modern AI voice detection
|
| 179 |
-
# The heuristics are calibrated for Canva, ElevenLabs, etc.
|
| 180 |
-
ai_probability = heuristic_fallback(features)
|
| 181 |
|
| 182 |
# 6. Interpret results
|
| 183 |
-
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
# 7. Generate explanation
|
| 187 |
explanation = generate_explanation(features, ai_probability)
|
|
@@ -198,4 +221,4 @@ async def predict_voice_authenticity(audio_base64: str, language: str) -> Dict:
|
|
| 198 |
if os.path.exists(temp_path):
|
| 199 |
os.remove(temp_path)
|
| 200 |
logger.error(f"Prediction error: {e}")
|
| 201 |
-
raise ValueError(f"Audio processing
|
|
|
|
| 149 |
# Clamp to valid range
|
| 150 |
return max(0.01, min(0.99, ai_score))
|
| 151 |
|
| 152 |
+
|
| 153 |
+
# Import SOTA model at module load time; HAS_SOTA gates the deep-learning
# path so inference can fall back to heuristics when torch/transformers
# (or ml.sota_model itself) are unavailable.
try:
    from ml.sota_model import get_detector
    HAS_SOTA = True
except ImportError as e:
    # NOTE(review): this logs via the root logger while the rest of the
    # module uses its module-level `logger` — consider unifying.
    logging.warning(f"Could not import SOTA model: {e}")
    HAS_SOTA = False
|
| 160 |
+
|
| 161 |
async def predict_voice_authenticity(audio_base64: str, language: str) -> Dict:
|
| 162 |
"""
|
| 163 |
+
Main inference pipeline using SOTA Deep Learning model
|
| 164 |
"""
|
| 165 |
temp_path = f"/tmp/{uuid.uuid4()}.mp3"
|
| 166 |
|
|
|
|
| 174 |
logger.error(f"Base64 decode failed: {e}")
|
| 175 |
raise ValueError("Invalid Base64 audio string")
|
| 176 |
|
| 177 |
+
# 2. Extract features (still useful for explanation)
|
| 178 |
features = extract_audio_features(temp_path)
|
| 179 |
|
| 180 |
+
# 3. Predict using SOTA Model
|
| 181 |
+
ai_probability = None
|
| 182 |
+
used_method = "SOTA"
|
| 183 |
+
|
| 184 |
+
if HAS_SOTA:
|
| 185 |
+
detector = get_detector()
|
| 186 |
+
ai_probability = detector.predict(temp_path)
|
| 187 |
+
|
| 188 |
+
# 4. Fallback to heuristics if SOTA fails
|
| 189 |
+
if ai_probability is None:
|
| 190 |
+
logger.warning("SOTA model unavailable/failed, falling back to heuristics")
|
| 191 |
+
ai_probability = heuristic_fallback(features)
|
| 192 |
+
used_method = "HEURISTIC"
|
| 193 |
+
|
| 194 |
+
# 5. Clean up
|
| 195 |
if os.path.exists(temp_path):
|
| 196 |
os.remove(temp_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
# 6. Interpret results
|
| 199 |
+
# Threshold can be tuned. SOTA models are usually very confident.
|
| 200 |
+
if ai_probability > 0.5:
|
| 201 |
+
classification = "AI_GENERATED"
|
| 202 |
+
confidence = ai_probability
|
| 203 |
+
else:
|
| 204 |
+
classification = "HUMAN"
|
| 205 |
+
confidence = 1.0 - ai_probability
|
| 206 |
+
|
| 207 |
+
logger.info(f"Method: {used_method}, Prob: {ai_probability:.4f}, Class: {classification}")
|
| 208 |
|
| 209 |
# 7. Generate explanation
|
| 210 |
explanation = generate_explanation(features, ai_probability)
|
|
|
|
| 221 |
if os.path.exists(temp_path):
|
| 222 |
os.remove(temp_path)
|
| 223 |
logger.error(f"Prediction error: {e}")
|
| 224 |
+
raise ValueError(f"Audio processing error: {str(e)}")
|
ml/sota_model.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import torchaudio
|
| 5 |
+
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
|
| 6 |
+
import logging
|
| 7 |
+
import os
|
| 8 |
+
import shutil
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)


class DeepfakeDetector:
    """Wav2Vec2-based audio deepfake detector built on a pre-trained
    Hugging Face audio-classification checkpoint.

    Load failures are recorded in ``self.loaded`` instead of raising, so the
    calling service can fall back to its heuristic pipeline.
    """

    def __init__(self, model_name="hemgg/Deepfake-audio-detection"):
        """Load the classifier and feature extractor onto GPU if available.

        Parameters
        ----------
        model_name : str
            Hugging Face model id of a Wav2Vec2 model fine-tuned for
            deepfake detection.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Loading SOTA model: {model_name} on {self.device}...")

        try:
            self.model = AutoModelForAudioClassification.from_pretrained(model_name).to(self.device).eval()
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
            self.loaded = True
            logger.info("SOTA Model loaded successfully!")
        except Exception as e:
            # Download/deserialization can fail (no network, bad cache);
            # mark as unloaded so predict() degrades gracefully.
            logger.error(f"Failed to load SOTA model: {e}")
            self.loaded = False

    def predict(self, audio_path):
        """Return the probability (0.0-1.0) that the audio is AI-generated.

        Parameters
        ----------
        audio_path : str
            Path to an audio file readable by librosa.

        Returns
        -------
        float | None
            Fake probability, or ``None`` when the model is not loaded or
            inference fails — callers treat ``None`` as "fall back".
        """
        if not self.loaded:
            logger.warning("SOTA model not loaded, returning None")
            return None

        try:
            # Lazy import: librosa is only needed at inference time and is a
            # more robust decoding backend (mp3 etc.) than torchaudio here.
            import librosa

            # Load mono audio resampled to the 16 kHz rate Wav2Vec2 expects.
            # librosa.load returns a 1-D float32 numpy array, which is exactly
            # what the feature extractor accepts — no tensor round-trip needed
            # (the previous torch.tensor(...).unsqueeze(0)/.squeeze().numpy()
            # dance was a no-op).
            waveform, _ = librosa.load(audio_path, sr=16000)

            input_values = self.feature_extractor(
                waveform,
                return_tensors="pt",
                sampling_rate=16000
            ).input_values.to(self.device)

            with torch.no_grad():
                logits = self.model(input_values).logits

            probs = F.softmax(logits, dim=-1)
            # Index 0 is treated as the "fake"/AI class.
            # NOTE(review): the checkpoint's label order is NOT checked here —
            # verify against self.model.config.id2label; an inverted mapping
            # would flip every prediction.
            fake_prob = probs[0][0].item()

            logger.info(f"SOTA Prediction - Fake Prob: {fake_prob:.4f}")
            return fake_prob

        except Exception as e:
            logger.error(f"SOTA prediction failed: {e}")
            return None
|
| 78 |
+
|
| 79 |
+
# Lazily-constructed module-wide singleton detector.
_detector = None


def get_detector():
    """Return the shared DeepfakeDetector, creating it on first call."""
    global _detector
    instance = _detector
    if instance is None:
        instance = DeepfakeDetector()
        _detector = instance
    return instance
|
requirements.txt
CHANGED
|
@@ -5,11 +5,13 @@ pydantic==2.5.3
|
|
| 5 |
python-multipart==0.0.6
|
| 6 |
|
| 7 |
# ML & Audio Processing
|
| 8 |
-
torch=
|
|
|
|
| 9 |
librosa==0.10.1
|
| 10 |
soundfile==0.12.1
|
| 11 |
numpy==1.26.3
|
| 12 |
scipy>=1.10.0
|
|
|
|
| 13 |
scikit-learn==1.4.0
|
| 14 |
|
| 15 |
# Utilities
|
|
|
|
| 5 |
python-multipart==0.0.6
|
| 6 |
|
| 7 |
# ML & Audio Processing
|
| 8 |
+
torch>=2.2.0
|
| 9 |
+
torchaudio>=2.2.0
|
| 10 |
librosa==0.10.1
|
| 11 |
soundfile==0.12.1
|
| 12 |
numpy==1.26.3
|
| 13 |
scipy>=1.10.0
|
| 14 |
+
transformers>=4.35.0 # For pre-trained deepfake models
|
| 15 |
scikit-learn==1.4.0
|
| 16 |
|
| 17 |
# Utilities
|