File size: 4,866 Bytes
c5c9261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136

import torch
import torch.nn.functional as F
import numpy as np
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
from app.config import settings
import logging
import gc

# Setup logging: configure the root logger at INFO level, then create the
# module-scoped logger used throughout this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class VoiceDetector:
    """Singleton wrapper around an audio-classification model.

    Every ``VoiceDetector()`` call returns the same shared instance. The
    first successful construction loads the feature extractor and the model
    named by ``settings.MODEL_NAME`` and keeps them on the CPU.
    """

    _instance = None

    def __new__(cls):
        """Return the shared instance, creating and loading it on first use.

        Raises:
            RuntimeError: If the model cannot be loaded (see ``load_model``).
        """
        if cls._instance is None:
            instance = super().__new__(cls)
            instance.model = None
            instance.feature_extractor = None
            # Force CPU to save memory on free tier
            instance.device = "cpu"
            instance.load_model()
            # Publish the singleton only after a successful load so a failed
            # load never leaves a half-initialised instance cached; the next
            # constructor call simply retries.
            cls._instance = instance
        return cls._instance

    def load_model(self):
        """Load the feature extractor and classifier onto ``self.device``.

        On success ``self.feature_extractor`` and ``self.model`` are set and
        the model is switched to eval mode. On failure neither attribute is
        modified.

        Raises:
            RuntimeError: If anything goes wrong while loading; the original
                exception is chained as ``__cause__``.
        """
        try:
            logger.info(f"Loading model {settings.MODEL_NAME} on {self.device}...")

            # Clear memory before loading
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Load into locals first so a failure part-way through does not
            # leave the instance with an extractor but no model.
            feature_extractor = AutoFeatureExtractor.from_pretrained(
                settings.MODEL_NAME
            )
            model = AutoModelForAudioClassification.from_pretrained(
                settings.MODEL_NAME,
                low_cpu_mem_usage=True,  # Memory optimization
                torch_dtype=torch.float32
            )
            model.to(self.device)
            model.eval()

            self.feature_extractor = feature_extractor
            self.model = model

            # Clear unused memory
            gc.collect()

            logger.info("Model loaded successfully.")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise RuntimeError(f"Failed to load model: {e}") from e

    def calibrate_confidence(self, probs, temperature=1.5):
        """Apply temperature scaling to a probability distribution.

        The probabilities are converted back to log-space, divided by
        ``temperature`` and re-normalised with a softmax over the last
        dimension.

        Temperature > 1.0 makes predictions less confident.
        Temperature < 1.0 makes predictions more confident.

        Args:
            probs: Tensor of probabilities; the last dimension is the class
                dimension.
            temperature: Positive scaling factor.

        Returns:
            Tensor of the same shape as ``probs`` whose last dimension sums
            to 1.

        Raises:
            ValueError: If ``temperature`` is not strictly positive (zero
                would divide by zero, a negative value would invert the
                ranking of the classes, NaN would poison the result).
        """
        # `not (t > 0)` also rejects NaN, which compares False to everything.
        if not (temperature > 0):
            raise ValueError(f"temperature must be > 0, got {temperature!r}")

        # Small epsilon keeps log() finite for probabilities that are exactly 0.
        logits = torch.log(probs + 1e-10)
        return F.softmax(logits / temperature, dim=-1)

    @staticmethod
    def _is_ai_label(label, pred_idx):
        """Decide whether a lower-cased model label means AI-generated audio."""
        if "fake" in label or "spoof" in label:
            return True
        if "real" in label or "bonafide" in label:
            return False
        # Direct index mapping fallback for unrecognised label names.
        # mo-thecreator/Deepfake-audio-detection usually uses:
        # 0 -> REAL, 1 -> FAKE
        return pred_idx == 1

    def predict(self, audio_array):
        """Classify one audio clip as AI-generated or human.

        Args:
            audio_array: Audio samples handed to the feature extractor with
                ``sampling_rate=settings.SAMPLE_RATE``. NOTE(review): the
                code reads only batch element 0 and calls ``.item()`` on the
                argmax, so this presumably must be a single clip -- confirm
                against callers.

        Returns:
            A ``(result_label, confidence)`` tuple where ``result_label`` is
            ``"AI_GENERATED"`` or ``"HUMAN"`` and ``confidence`` is the raw
            softmax probability (float) of the predicted class.

        Raises:
            RuntimeError: If the model has to be (re)loaded and that fails,
                or if feature extraction / inference fails; the original
                exception is chained as ``__cause__``.
        """
        # Lazily (re)load if the instance exists without a model.
        if self.model is None:
            self.load_model()

        try:
            # Prepare input
            inputs = self.feature_extractor(
                audio_array,
                sampling_rate=settings.SAMPLE_RATE,
                return_tensors="pt",
                padding=True
            )
            inputs = {key: val.to(self.device) for key, val in inputs.items()}

            # Inference
            with torch.no_grad():
                logits = self.model(**inputs).logits

            # Use raw softmax for the base confidence
            probs = F.softmax(logits, dim=-1)

            # Predicted class index and its label from the model config
            pred_idx = torch.argmax(probs, dim=-1).item()
            label = str(self.model.config.id2label[pred_idx]).lower()
            confidence = probs[0][pred_idx].item()

            logger.info(f"Model Raw Output: Index={pred_idx}, Label={label}, Confidence={confidence:.4f}")

            is_ai = self._is_ai_label(label, pred_idx)
            result_label = "AI_GENERATED" if is_ai else "HUMAN"

            # Stability check: If confidence is too low (< 0.6),
            # the model is essentially guessing. Only logged; the result is
            # still returned unchanged.
            if confidence < 0.6:
                logger.info(f"Low confidence ({confidence:.4f}) detected. Result might be uncertain.")

            return result_label, confidence

        except Exception as e:
            logger.error(f"Prediction error: {e}")
            raise RuntimeError(f"Prediction failed: {e}") from e

# Shared module-level detector. Constructing it here runs
# VoiceDetector.__new__, which loads the model as a side effect of importing
# this module.
voice_detector = VoiceDetector()