# Source: Hugging Face Space "Krish1440/ai-voice-detector-api" (status when captured: Sleeping)
# File size: 6,764 bytes; revision 60d70d0

import io
import librosa
import numpy as np
import soundfile as sf
import torch
from transformers import pipeline

class AudioDetector:
    """Weighted ensemble of audio-classification pipelines for AI-voice detection.

    Each configured model is loaded as a ``transformers`` "audio-classification"
    pipeline. ``analyze_audio`` runs every loaded pipeline on the decoded
    waveform, converts each output to a probability that the audio is
    AI-generated, and combines those probabilities as a weighted average.
    """

    # Lower-cased labels that mean "synthetic speech" in a model's output.
    AI_LABELS = frozenset({"fake", "spoof", "aivoice", "artificial", "generated"})
    # Lower-cased labels that mean "genuine speech". Consulted only when a
    # model's output contains none of AI_LABELS; the AI probability is then
    # taken as the complement of the human score.
    HUMAN_LABELS = frozenset(
        {"real", "human", "humanvoice", "bonafide", "bona-fide", "genuine"}
    )
    # Sample rate (Hz) the audio is resampled to before inference.
    TARGET_SAMPLE_RATE = 16000
    # A probability strictly above this value is treated as "AI".
    DECISION_THRESHOLD = 0.5
    # Mean spectral centroid (Hz) above which the AI explanation mentions
    # high-frequency artifacts. Affects explanation wording only.
    CENTROID_THRESHOLD_HZ = 2000

    def __init__(self):
        """Load every configured model, keeping the ones that load successfully.

        A model that fails to load is reported and skipped, so construction
        never raises because of a single bad model. If nothing loads, the
        ensemble is empty and ``analyze_audio`` will return its failure
        response.
        """
        print("--- [AudioDetector] Initializing 4-Model Ensemble System... ---")

        # The committee of experts: model id, display name and voting weight.
        self.models_config = [
            {
                "id": "MelodyMachine/Deepfake-audio-detection-V2",
                "name": "MelodyMachine",
                "weight": 1.0,
            },
            {
                "id": "mo-thecreator/Deepfake-audio-detection",
                "name": "Mo-Creator",
                "weight": 1.0,
            },
            {
                "id": "Hemgg/Deepfake-audio-detection",
                "name": "Hemgg",
                "weight": 1.0,
            },
            {
                "id": "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification",
                "name": "Gustking-XLSR",
                "weight": 1.2,  # Higher weight for the large model
            },
        ]

        # Entries are {"pipe": <pipeline callable>, "config": <models_config entry>}.
        self.pipelines = []

        for cfg in self.models_config:
            try:
                print(f"--- Loading Model: {cfg['name']} ({cfg['id']}) ---")
                p = pipeline("audio-classification", model=cfg['id'])
            except Exception as e:
                # Best effort: one broken model must not take the service down.
                print(f"[-] Failed to load {cfg['name']}: {e}")
            else:
                self.pipelines.append({"pipe": p, "config": cfg})
                print(f"[+] Loaded {cfg['name']}")

        if not self.pipelines:
            print("CRITICAL: No models could be loaded. Ensemble is empty.")

    def analyze_audio(self, audio_data: bytes, language: str) -> dict:
        """Classify encoded audio as AI-generated or human.

        Args:
            audio_data: Encoded audio file contents, decoded with ``librosa.load``.
            language: Accepted for interface compatibility; this method does
                not use it.

        Returns:
            On success, a dict with ``classification`` ("AI_GENERATED" or
            "HUMAN"), ``confidenceScore`` (probability of the chosen class,
            rounded to 2 decimals) and ``explanation``.
            On any failure (undecodable or empty audio, or no model producing
            a usable prediction), a dict with ``classification`` "HUMAN",
            ``confidenceScore`` 0.0, ``error`` and ``explanation``. This method
            does not raise.
        """
        try:
            # 1. Decode the audio.
            y, sr = self._load_waveform(audio_data)

            # 2. Run the ensemble and aggregate.
            votes = self._collect_votes(y)
            final_ensemble_score = self._weighted_score(votes)
            if final_ensemble_score is None:
                # Without any vote there is no evidence either way; reporting
                # a confident "HUMAN" here would be misleading.
                raise RuntimeError("No model produced a usable prediction.")

            is_ai = final_ensemble_score > self.DECISION_THRESHOLD
            final_classification = "AI_GENERATED" if is_ai else "HUMAN"

            # Confidence is the probability of the winning class.
            class_confidence = (
                final_ensemble_score if is_ai else (1.0 - final_ensemble_score)
            )

            print(f"--- Final Ensemble Score: {final_ensemble_score:.4f} => {final_classification} (Conf: {class_confidence:.2f}) ---\n")

            # 3. The spectral centroid only shapes the wording of the AI
            # explanation, so it is computed only when that branch is taken.
            centroid = None
            if is_ai:
                centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

            return {
                "classification": final_classification,
                # Logical confidence (probability of the chosen class).
                "confidenceScore": round(float(class_confidence), 2),
                "explanation": self._build_explanation(
                    votes, final_ensemble_score, is_ai, centroid
                ),
            }

        except Exception as e:
            # Top-level boundary: callers always get a well-formed response.
            print(f"Analysis Failed: {e}")
            return {
                "classification": "HUMAN",  # Fail safe
                "confidenceScore": 0.0,
                "error": str(e),
                "explanation": "Analysis failed due to internal error.",
            }

    def _load_waveform(self, audio_data):
        """Decode ``audio_data`` into ``(samples, sample_rate)`` at the target rate.

        Raises:
            ValueError: If no bytes were supplied or decoding yields no samples.
        """
        if not audio_data:
            raise ValueError("No audio data provided.")
        y, sr = librosa.load(io.BytesIO(audio_data), sr=self.TARGET_SAMPLE_RATE)
        if y.size == 0:
            raise ValueError("Audio contains no samples.")
        return y, sr

    @classmethod
    def _ai_probability(cls, results):
        """Return the AI probability implied by one pipeline output, or ``None``.

        ``results`` is a sequence of ``{"label": ..., "score": ...}`` dicts.
        The first entry whose label is in ``AI_LABELS`` wins. If there is
        none, the complement of the first ``HUMAN_LABELS`` score is used.
        ``None`` means the labels were not recognized at all.
        """
        human_score = None
        for r in results:
            label_clean = r['label'].lower().strip()
            if label_clean in cls.AI_LABELS:
                return float(r['score'])
            if human_score is None and label_clean in cls.HUMAN_LABELS:
                human_score = float(r['score'])
        if human_score is not None:
            return 1.0 - human_score
        return None

    def _collect_votes(self, waveform):
        """Run every loaded pipeline on ``waveform`` and return the usable votes.

        Each vote is a dict with ``name``, ``ai_prob``, ``verdict`` and
        ``weight``. A model that raises, or whose labels are not recognized,
        is reported and left out instead of being counted as a human vote.
        """
        votes = []

        print(f"\n--- Running Ensemble Inference on {len(self.pipelines)} models ---")

        for item in self.pipelines:
            cfg = item['config']
            try:
                results = item['pipe'](waveform, top_k=None)  # Get all labels
                ai_score = self._ai_probability(results)
            except Exception as e:
                print(f"Error inferencing {cfg['name']}: {e}")
                continue

            if ai_score is None:
                print(f"Skipping {cfg['name']}: no recognized AI/human label in output")
                continue

            verdict = "AI" if ai_score > self.DECISION_THRESHOLD else "HUMAN"
            votes.append({
                "name": cfg['name'],
                "ai_prob": ai_score,
                "verdict": verdict,
                "weight": cfg['weight'],
            })
            print(f" > {cfg['name']}: {ai_score:.4f} ({verdict})")

        return votes

    @staticmethod
    def _weighted_score(votes):
        """Return the weight-averaged AI probability, or ``None`` without votes."""
        total_weight = sum(v['weight'] for v in votes)
        if total_weight <= 0:
            return None
        return sum(v['ai_prob'] * v['weight'] for v in votes) / total_weight

    @classmethod
    def _build_explanation(cls, votes, score, is_ai, centroid):
        """Compose the human-readable explanation for a finished analysis.

        ``centroid`` is only consulted when ``is_ai`` is true.
        """
        ai_votes_count = sum(1 for v in votes if v['verdict'] == 'AI')
        explanations = [
            f"Ensemble Analysis: {ai_votes_count}/{len(votes)} models flagged this audio as AI-generated.",
            f"Aggregated Score: {score*100:.1f}%.",
        ]

        if not is_ai:
            explanations.append("Acoustic analysis confirms natural vocal resonance and organic production.")
        elif centroid is not None and centroid > cls.CENTROID_THRESHOLD_HZ:
            explanations.append("High-frequency spectral artifacts consistent with neural vocoders detected.")
        else:
            explanations.append("Deep learning pattern matching identified non-biological features.")

        return " ".join(explanations)

# Global instance, created once when this module is imported.
# NOTE: constructing it here runs AudioDetector.__init__ at import time, which
# attempts to load every configured model pipeline before the import returns.
detector = AudioDetector()