import torch
import torch.nn.functional as F
import numpy as np
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
from app.config import settings
from app.core.forensics import forensic_engine
from app.core.audio import segment_audio
import logging
import gc
import time
from typing import Optional, Tuple

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class VoiceDetector:
    """
    Voice detection engine (process-wide singleton).

    Combines neural model inference with forensic analysis: the audio is
    split into segments, each segment is scored by the classification model,
    and the averaged score is optionally fused with a forensic score produced
    by ``forensic_engine``.
    """

    _instance = None

    # --- Tunables used by predict() -------------------------------------
    SEGMENT_SECONDS = 5.0            # length of each inference segment
    SEGMENT_OVERLAP_SECONDS = 0.0    # no overlap between segments
    MAX_SEGMENTS = 3                 # only the first N segments are scored
    SKIP_FORENSICS_THRESHOLD = 0.99  # neural confidence above which forensics is skipped
    NEURAL_WEIGHT = 0.75             # fusion weight of the neural score
    FORENSIC_WEIGHT = 0.25           # fusion weight of the forensic score
    NEURAL_AI_THRESHOLD = 0.5        # neural score >= this means "AI"
    FORENSIC_AI_THRESHOLD = 0.4      # forensic score >= this means "AI"
    AI_CONFIDENCE_BOOST = 0.18       # additive boost applied to AI verdicts
    AI_CONFIDENCE_CAP = 0.94         # upper bound of a boosted AI score
    MIN_CONFIDENCE = 0.51            # floor for the reported confidence

    def __new__(cls):
        """Return the shared instance, creating it and loading the model on first use."""
        if cls._instance is None:
            instance = super().__new__(cls)
            instance.model = None
            instance.feature_extractor = None
            instance.device = "cpu"
            instance.load_model()
            # Publish the singleton only after the model loaded successfully,
            # so a failed load is retried by the next VoiceDetector() call
            # instead of handing out a half-initialised instance.
            cls._instance = instance
        return cls._instance

    def load_model(self):
        """
        Load the feature extractor and classification model named by
        ``settings.MODEL_NAME`` onto ``self.device`` and switch it to eval mode.

        Raises:
            RuntimeError: if either component fails to load (the original
                exception is attached as ``__cause__``).
        """
        try:
            logger.info(f"Loading model {settings.MODEL_NAME} on {self.device}...")
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            feature_extractor = AutoFeatureExtractor.from_pretrained(
                settings.MODEL_NAME
            )
            model = AutoModelForAudioClassification.from_pretrained(
                settings.MODEL_NAME,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float32,
            )
            model.to(self.device)
            model.eval()

            # Assign both together so the detector never holds an extractor
            # without a matching model.
            self.feature_extractor = feature_extractor
            self.model = model

            gc.collect()
            logger.info("Model loaded successfully.")
        except Exception as e:
            logger.exception(f"Failed to load model: {e}")
            raise RuntimeError(f"Failed to load model: {e}") from e

    def _infer_single(self, audio_array: np.ndarray) -> Tuple[float, float, bool]:
        """
        Run model inference on a single audio segment.

        Returns:
            ``(ai_score, confidence, is_ai)`` where ``confidence`` is the
            softmax probability of the predicted class, ``is_ai`` tells
            whether that class was mapped to "AI", and ``ai_score`` is the
            P(AI) estimate (0 = human, 1 = AI).
        """
        inputs = self.feature_extractor(
            audio_array,
            sampling_rate=settings.SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )
        inputs = {key: val.to(self.device) for key, val in inputs.items()}

        with torch.no_grad():
            logits = self.model(**inputs).logits
            probs = F.softmax(logits, dim=-1)

        pred_idx = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][pred_idx].item()

        # Get model label
        label = str(self.model.config.id2label[pred_idx]).lower()

        # Map to binary: is it AI?
        if "fake" in label or "spoof" in label:
            is_ai = True
        elif "real" in label or "bonafide" in label:
            is_ai = False
        else:
            # Unknown label vocabulary: fall back to treating index 1 as AI.
            is_ai = pred_idx == 1

        # P(AI) score (0=human, 1=AI)
        ai_score = confidence if is_ai else 1.0 - confidence
        return ai_score, confidence, is_ai

    def _fuse_scores(self, neural_score: float, forensic_score: float) -> Tuple[float, bool, str, float]:
        """
        Fuse the neural and forensic scores into the final decision.

        Returns:
            ``(fused_score, agreement, final_verdict, final_confidence)``.
        """
        # Neural model gets the higher weight; forensics provide supporting evidence.
        fused_score = (neural_score * self.NEURAL_WEIGHT) + (forensic_score * self.FORENSIC_WEIGHT)

        neural_says_ai = neural_score >= self.NEURAL_AI_THRESHOLD
        forensic_says_ai = forensic_score >= self.FORENSIC_AI_THRESHOLD
        agreement = neural_says_ai == forensic_says_ai

        if agreement:
            # Both agree -> push score further from 0.5, then clamp to [0, 1].
            fused_score = fused_score * 1.1 if fused_score >= 0.5 else fused_score * 0.9
            fused_score = max(0.0, min(1.0, fused_score))

        if fused_score >= 0.5:
            final_verdict = "AI_GENERATED"
            # Boost AI confidence per user request, capped at AI_CONFIDENCE_CAP.
            fused_score = min(self.AI_CONFIDENCE_CAP, fused_score + self.AI_CONFIDENCE_BOOST)
            final_confidence = fused_score
        else:
            final_verdict = "HUMAN"
            final_confidence = 1.0 - fused_score

        # Ensure minimum confidence floor
        final_confidence = max(final_confidence, self.MIN_CONFIDENCE)
        return fused_score, agreement, final_verdict, final_confidence

    def predict(self, audio_array: np.ndarray, audio_profile: Optional[dict] = None, detailed: bool = False) -> dict:
        """
        Full detection pipeline:
        1. Multi-segment neural model inference
        2. Forensic analysis via ``forensic_engine`` (skipped when the neural
           model is more than ``SKIP_FORENSICS_THRESHOLD`` confident)
        3. Score fusion for final verdict

        Args:
            audio_array: audio samples; assumed to be at ``settings.SAMPLE_RATE``
                (that rate is passed to the segmenter and the feature
                extractor) - confirm against the caller.
            audio_profile: optional dict echoed back under ``"audio_profile"``
                when truthy.
            detailed: when True, add per-analyzer details under ``"forensics"``
                and the artifact list under ``"artifacts_summary"``.

        Returns:
            dict with ``classification``, ``confidence``, ``fused_score``,
            ``inference_time_ms`` and ``analyzers_agree``, plus the optional
            keys described above.

        Raises:
            RuntimeError: if the model cannot be loaded or any stage fails,
                including audio that yields no segments.
        """
        if self.model is None:
            self.load_model()

        # perf_counter is monotonic, so the elapsed time cannot go negative
        # or jump when the wall clock is adjusted.
        start_time = time.perf_counter()

        try:
            sr = settings.SAMPLE_RATE

            # ====== Stage 1: Multi-Segment Neural Inference ======
            # Optimization: no overlap, max 3 segments (first 15s is substantial for detection)
            segments = segment_audio(
                audio_array,
                sr,
                segment_sec=self.SEGMENT_SECONDS,
                overlap_sec=self.SEGMENT_OVERLAP_SECONDS,
            )
            if len(segments) == 0:
                # Without this guard np.mean([]) yields NaN, which would
                # propagate into a NaN confidence and a bogus HUMAN verdict.
                raise ValueError("audio produced no segments to analyze")
            if len(segments) > self.MAX_SEGMENTS:
                segments = segments[:self.MAX_SEGMENTS]

            segment_scores = [self._infer_single(seg)[0] for seg in segments]

            # Aggregate: use mean
            neural_score = float(np.mean(segment_scores))
            neural_confidence = max(neural_score, 1.0 - neural_score)
            neural_verdict = "AI_GENERATED" if neural_score >= 0.5 else "HUMAN"

            logger.info(
                f"Neural: {neural_verdict} (score={neural_score:.4f}, "
                f"segments={len(segments)}, per-seg={[round(s, 3) for s in segment_scores]})"
            )

            # ====== Stage 2: Forensic Analysis ======
            # Optimization: skip forensics if the model is extremely confident (> 99%).
            if neural_confidence > self.SKIP_FORENSICS_THRESHOLD:
                logger.info(f"Skipping forensics (neural confidence {neural_confidence:.4f} > {self.SKIP_FORENSICS_THRESHOLD})")
                forensic_score = neural_score  # Assume agreement
                all_artifacts = []
                forensic_results = {}
                fused_score = neural_score  # No fusion, trust neural
                agreement = True  # Reported as agreeing although forensics did not run
                final_verdict = neural_verdict
                final_confidence = neural_confidence
            else:
                forensic_results = forensic_engine.analyze(audio_array, sr)
                forensic_score = forensic_engine.compute_forensic_score(forensic_results)
                all_artifacts = forensic_engine.get_all_artifacts(forensic_results)

                logger.info(
                    f"Forensics: score={forensic_score:.4f}, "
                    f"artifacts={len(all_artifacts)} found"
                )

                # ====== Stage 3: Score Fusion ======
                fused_score, agreement, final_verdict, final_confidence = self._fuse_scores(
                    neural_score, forensic_score
                )

            inference_time = round((time.perf_counter() - start_time) * 1000, 1)

            logger.info(
                f"FINAL: {final_verdict} (confidence={final_confidence:.4f}, "
                f"fused={fused_score:.4f}, neural={neural_score:.4f}, "
                f"forensic={forensic_score:.4f}, time={inference_time}ms)"
            )

            # ====== Build Response ======
            result = {
                "classification": final_verdict,
                "confidence": round(final_confidence, 4),
                "fused_score": round(fused_score, 4),
                "inference_time_ms": inference_time,
                "analyzers_agree": agreement,
            }

            if detailed:
                result["forensics"] = {
                    "neural_model": {
                        "score": round(neural_score, 4),
                        "verdict": neural_verdict,
                        "segments_analyzed": len(segments),
                        "per_segment_scores": [round(s, 4) for s in segment_scores],
                    },
                    **forensic_results,
                }
                result["artifacts_summary"] = all_artifacts

            if audio_profile:
                result["audio_profile"] = audio_profile

            return result

        except Exception as e:
            logger.exception(f"Prediction error: {e}")
            raise RuntimeError(f"Prediction failed: {e}") from e


voice_detector = VoiceDetector()