Spaces:

testing-ak
/

voice-detection-v2

Sleeping

File size: 5,050 Bytes

import torch
import librosa
import numpy as np
import io
import torch.nn.functional as F
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor

class VoiceDetector:
    """Classify a speech clip as human or AI-generated.

    Wraps a Hugging Face audio-classification model together with its
    feature extractor. ``analyze`` is the public entry point;
    ``preprocess_audio`` can also be called on its own to obtain the
    waveform that is fed to the model.
    """

    # Sample rate (Hz) used for resampling and passed to the feature extractor.
    SAMPLE_RATE = 16000
    # Clips shorter than this many seconds are looped until they reach it.
    MIN_DURATION_S = 1.5
    # Clips longer than this many seconds are cropped to their middle section.
    MAX_DURATION_S = 5

    def __init__(self):
        """Load the feature extractor and the classification model.

        Raises:
            Exception: any error raised while loading is logged and re-raised
                unchanged, so construction fails loudly.
        """
        print("⏳ Loading Robust AI Detection Model...")
        self.model_name = "MelodyMachine/Deepfake-audio-detection-V2"

        try:
            self.feature_extractor = AutoFeatureExtractor.from_pretrained(self.model_name)
            self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
            self.model.eval()

            # Print the label mapping so the meaning of each output index can
            # be checked in the logs.
            print(f"✅ Model Labels: {self.model.config.id2label}")

        except Exception as e:
            print(f"❌ CRITICAL ERROR: Failed to load AI model. {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def preprocess_audio(self, audio_buffer: io.BytesIO, target_sr=16000):
        """Decode, resample, peak-normalize and length-fit an audio clip.

        Args:
            audio_buffer: in-memory audio file; it is rewound before reading.
            target_sr: sample rate (Hz) the audio is resampled to.

        Returns:
            The waveform array produced by ``librosa.load``, normalized and
            then looped/cropped by ``_fit_duration``.

        Raises:
            ValueError: if the decoded audio contains no samples.
        """
        audio_buffer.seek(0)
        y, _ = librosa.load(audio_buffer, sr=target_sr)

        # Fail with a clear message instead of an opaque error further down.
        if len(y) == 0:
            raise ValueError("Audio contains no samples.")

        # Normalize volume first (matters for quiet clips); this happens on
        # the full clip, before any cropping.
        y = librosa.util.normalize(y)

        return self._fit_duration(y, target_sr)

    @staticmethod
    def _fit_duration(y, target_sr):
        """Loop clips that are too short and center-crop clips that are too long.

        A clip shorter than ``MIN_DURATION_S`` is repeated a whole number of
        times, so the result may exceed the minimum. A clip longer than
        ``MAX_DURATION_S`` is reduced to its middle ``MAX_DURATION_S`` seconds.

        Raises:
            ValueError: if ``y`` is empty (looping would divide by zero).
        """
        n_samples = len(y)
        if n_samples == 0:
            raise ValueError("Audio contains no samples.")

        min_len = target_sr * VoiceDetector.MIN_DURATION_S
        if n_samples < min_len:
            repeats = int(np.ceil(min_len / n_samples))
            y = np.tile(y, repeats)

        # NOTE(review): the original comment states the model expects roughly
        # 3-5 s of audio; that is not verifiable from this file.
        max_len = int(target_sr * VoiceDetector.MAX_DURATION_S)
        if len(y) > max_len:
            start = (len(y) - max_len) // 2
            y = y[start:start + max_len]

        return y

    @staticmethod
    def _score_labels(probabilities, id2label):
        """Map class probabilities to a ``(real_score, fake_score)`` pair.

        Labels whose name contains "fake" or "spoof" feed the fake score;
        labels containing "real" or "bonafide" feed the real score. If no
        label name is recognized at all (e.g. "LABEL_0"/"LABEL_1"), index 0 is
        taken as real and index 1 as fake.
        """
        real_score = None
        fake_score = None

        for idx, label in id2label.items():
            name = str(label).lower()
            # int() tolerates configs whose keys are strings.
            if "fake" in name or "spoof" in name:
                fake_score = probabilities[int(idx)]
            elif "real" in name or "bonafide" in name:
                real_score = probabilities[int(idx)]

        if real_score is None and fake_score is None:
            # NOTE(review): assumes index 0 = real and index 1 = fake for
            # generic label names - confirm against the id2label mapping
            # printed at load time.
            return probabilities[0], probabilities[1]

        return (
            0.0 if real_score is None else real_score,
            0.0 if fake_score is None else fake_score,
        )

    @staticmethod
    def _build_verdict(real_score, fake_score):
        """Turn the two scores into the response dictionary returned by ``analyze``."""
        # The higher score wins; a tie is reported as human.
        if fake_score > real_score:
            classification = "AI_GENERATED"
            confidence = fake_score
            explanation = f"Detected synthetic artifacts with {int(fake_score*100)}% confidence."
        else:
            classification = "HUMAN"
            confidence = real_score
            explanation = f"Verified human vocal characteristics with {int(real_score*100)}% confidence."

        return {
            "classification": classification,
            "confidenceScore": round(confidence, 2),
            "explanation": explanation
        }

    def analyze(self, audio_buffer: io.BytesIO, language: str):
        """Classify an audio clip as human or AI-generated.

        Args:
            audio_buffer: in-memory audio file to analyze.
            language: accepted for interface compatibility; not used here.

        Returns:
            A dict with keys ``classification`` ("AI_GENERATED" or "HUMAN"),
            ``confidenceScore`` (winning probability rounded to 2 decimals)
            and ``explanation``. Any failure is caught and reported as
            "HUMAN" with a confidence of 0.0 and the error text in
            ``explanation``.
        """
        try:
            # 1. Preprocess
            audio_input = self.preprocess_audio(audio_buffer, target_sr=self.SAMPLE_RATE)

            # 2. Prepare input; the rate must match the one used for resampling.
            inputs = self.feature_extractor(
                audio_input,
                sampling_rate=self.SAMPLE_RATE,
                return_tensors="pt",
                padding=True
            )

            # 3. Inference
            with torch.no_grad():
                logits = self.model(**inputs).logits

            # 4. Probabilities of the first (only) item in the batch.
            probabilities = F.softmax(logits, dim=-1)[0].tolist()

            # 5. Resolve which probability means "real" and which "fake".
            real_score, fake_score = self._score_labels(
                probabilities, self.model.config.id2label
            )

            print(f"🔍 DEBUG: Real Score: {real_score:.4f} | Fake Score: {fake_score:.4f}")

            # 6. Decision
            return self._build_verdict(real_score, fake_score)

        except Exception as e:
            # Deliberate best-effort boundary: callers always get a response.
            # NOTE(review): errors are reported as "HUMAN"; callers should
            # treat a confidenceScore of 0.0 as "analysis failed".
            print(f"Analysis Error: {e}")
            return {
                "classification": "HUMAN",
                "confidenceScore": 0.0,
                "explanation": f"Error: {str(e)}"
            }

# Thin wrapper around the built-in int(); used on NumPy results such as np.ceil().
def int_(val):
    """Return ``val`` converted to a built-in Python ``int``."""
    converted = int(val)
    return converted