S-Vetrivel commited on
Commit
04f3da2
·
1 Parent(s): 9d2870b

Implement hybrid AI-voice detection: sliding-window inference, spectral and temporal features, and language-aware confidence.

Browse files
Files changed (3) hide show
  1. app/infer.py +79 -59
  2. app/main.py +1 -1
  3. test_full_audio.py +39 -0
app/infer.py CHANGED
@@ -29,89 +29,109 @@ class VoiceClassifier:
29
  print(f"Error loading model: {e}")
30
  self.model = None
31
 
32
- def predict(self, waveform: torch.Tensor):
33
  if self.model is None:
34
  return {"error": "Model not loaded"}
35
 
36
  try:
37
  # 1. Preprocess Audio
38
- # Waveform is [1, T] Tensor. Convert to numpy [T]
39
  wav_np = waveform.squeeze().cpu().numpy()
 
40
 
41
- # Ensure we send it as a list or numpy array to the extractor
42
- inputs = self.feature_extractor(
43
- wav_np,
44
- sampling_rate=16000,
45
- return_tensors="pt",
46
- padding=True
47
  )
 
 
48
 
49
- inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
50
 
51
- t0 = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- # 2. Model Inference
54
- with torch.no_grad():
55
- outputs = self.model(**inputs)
56
- logits = outputs.logits
57
- probs = torch.softmax(logits, dim=-1)
58
-
59
- # Logic for this specific model:
60
- # Label 0: 'fake' (AI)
61
- # Label 1: 'real' (Human)
62
- prob_fake = probs[0][0].item()
63
- prob_real = probs[0][1].item()
64
-
65
  t1 = time.time()
66
- print(f"DEBUG: Inference took {t1 - t0:.3f}s. probs: {probs}")
67
- print(f"DEBUG: prob_fake={prob_fake:.6f}, prob_real={prob_real:.6f}")
68
 
69
- # 3. Pitch Analysis (for explanation)
70
- # Use librosa for pitch tracking (fast approximation)
71
- f0, voiced_flag, voiced_probs = librosa.pyin(
72
- wav_np,
73
- fmin=librosa.note_to_hz('C2'),
74
- fmax=librosa.note_to_hz('C7'),
75
- sr=16000,
76
- frame_length=2048
77
- )
78
- f0 = f0[~np.isnan(f0)]
79
- pitch_var = np.std(f0) if len(f0) > 0 else 0.0
80
 
81
- t2 = time.time()
82
- print(f"DEBUG: Pitch Detection took {t2 - t1:.3f}s. Variance: {pitch_var:.2f}")
83
-
84
- # 4. Final Classification Logic
85
- # Deepfake model is the authority
86
- if prob_fake > prob_real:
87
- prediction = "AI_GENERATED"
88
- confidence = prob_fake
89
- prob_ai = prob_fake
90
- else:
91
- prediction = "HUMAN"
92
- confidence = prob_real
93
- prob_ai = prob_fake
 
94
 
95
- print(f"DEBUG: prediction={prediction}, confidence={confidence:.6f}, prob_ai={prob_ai:.6f}")
96
-
97
  # Construct Explanation
98
  if prediction == "AI_GENERATED":
99
- if pitch_var < 20.0:
100
- explanation = f"Deepfake model reported {confidence*100:.1f}% confidence. Detected unnatural pitch consistency (Variance: {pitch_var:.1f})."
101
- else:
102
- explanation = f"Deepfake model reported {confidence*100:.1f}% confidence. Detected digital artifacts characteristic of AI synthesis."
 
103
  else:
104
- if pitch_var > 20.0:
105
- explanation = f"Deepfake model reported {confidence*100:.1f}% confidence. Natural prosody and high pitch variance detected."
106
  else:
107
- explanation = f"Deepfake model reported {confidence*100:.1f}% confidence. Audio classified as human despite low pitch variance."
108
 
109
  return {
110
  "prediction": prediction,
111
- "probability_ai": float(f"{prob_ai:.4f}"),
112
  "confidence": float(f"{confidence:.4f}"),
113
  "features": {
114
- "pitch_variance": float(f"{pitch_var:.2f}")
 
 
 
115
  },
116
  "explanation": explanation
117
  }
 
29
  print(f"Error loading model: {e}")
30
  self.model = None
31
 
32
+ def predict(self, waveform: torch.Tensor, language: str = "Unknown"):
33
  if self.model is None:
34
  return {"error": "Model not loaded"}
35
 
36
  try:
37
  # 1. Preprocess Audio
 
38
  wav_np = waveform.squeeze().cpu().numpy()
39
+ sr = 16000
40
 
41
+ # --- ADVANCED FEATURE EXTRACTION ---
42
+ t0 = time.time()
43
+
44
+ # A. Pitch Analysis
45
+ f0, voiced_flag, voiced_probs = librosa.pyin(
46
+ wav_np, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr
47
  )
48
+ f0_clean = f0[~np.isnan(f0)]
49
+ pitch_var = np.std(f0_clean) if len(f0_clean) > 0 else 0.0
50
 
51
+ # B. Spectral Flatness (Detects vocoder buzz)
52
+ flatness = np.mean(librosa.feature.spectral_flatness(y=wav_np))
53
 
54
+ # C. RMS Energy Variance (Detects flattened volume envelopes)
55
+ rms = librosa.feature.rms(y=wav_np)[0]
56
+ rms_var = np.std(rms) / (np.mean(rms) + 1e-6) # Normalized variance
57
+
58
+ # D. Zero Crossing Rate Variance (Detects robotic vowel transitions)
59
+ zcr = librosa.feature.zero_crossing_rate(wav_np)[0]
60
+ zcr_var = np.std(zcr)
61
+
62
+ # --- TEMPORAL CONSISTENCY (SLIDING WINDOW) ---
63
+ chunk_size = 2 * sr # 2 seconds
64
+ stride = 1 * sr # 1 second overlap
65
+ chunks = []
66
+ for i in range(0, len(wav_np) - chunk_size + 1, stride):
67
+ chunks.append(wav_np[i : i + chunk_size])
68
+
69
+ # If audio too short for stride, just use whole thing
70
+ if not chunks:
71
+ chunks = [wav_np]
72
+
73
+ chunk_probs = []
74
+ for chunk in chunks:
75
+ inputs = self.feature_extractor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
76
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
77
+ with torch.no_grad():
78
+ outputs = self.model(**inputs)
79
+ probs = torch.softmax(outputs.logits, dim=-1)
80
+ chunk_probs.append(probs[0][0].item()) # Probability of 'fake'
81
+
82
+ # Authority calculation
83
+ # We take the MAX probability across chunks to catch 'slips' in AI generation
84
+ prob_fake = np.max(chunk_probs)
85
+ prob_real = 1.0 - prob_fake
86
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  t1 = time.time()
88
+ print(f"DEBUG: Analysis took {t1 - t0:.3f}s. Multi-chunk prob_fake: {prob_fake:.4f}")
89
+ print(f"DEBUG: Features - PitchVar: {pitch_var:.1f}, Flatness: {flatness:.4f}, RMS_Var: {rms_var:.4f}")
90
 
91
+ # --- HYBRID HEURISTIC STRENGTHENING ---
92
+ # AI Voices often have VERY low flatness or VERY low pitch variance
93
+ ai_signal_flags = 0
94
+ if pitch_var < 15.0: ai_signal_flags += 1
95
+ if flatness < 0.005: ai_signal_flags += 1 # Very tonal/melodic
96
+ if rms_var < 0.1: ai_signal_flags += 1 # Robotic volume
 
 
 
 
 
97
 
98
+ # Final Verdict Logic
99
+ # If the model is unsure (0.4-0.6) but signal flags are high, tip to AI
100
+ if 0.4 < prob_fake < 0.6 and ai_signal_flags >= 2:
101
+ prob_fake = 0.75
102
+
103
+ prediction = "AI_GENERATED" if prob_fake > 0.5 else "HUMAN"
104
+ confidence = prob_fake if prediction == "AI_GENERATED" else prob_real
105
+
106
+ # --- LANGUAGE AWARENESS ---
107
+ # If it's a non-English language, the model might be slightly less reliable
108
+ # We dampen confidence on low-resource languages to prevent false accusations
109
+ is_english = language.lower() in ["english", "en"]
110
+ if not is_english and confidence < 0.85:
111
+ confidence *= 0.95 # Slight dampening
112
 
 
 
113
  # Construct Explanation
114
  if prediction == "AI_GENERATED":
115
+ reasons = []
116
+ if ai_signal_flags >= 2: reasons.append("synthetic spectral characteristics")
117
+ if pitch_var < 20: reasons.append("lack of natural prosody")
118
+ if not reasons: reasons.append("digital vocoder artifacts")
119
+ explanation = f"AI detected with {confidence*100:.1f}% confidence. Evidence: {', '.join(reasons)}."
120
  else:
121
+ if pitch_var > 25.0:
122
+ explanation = f"Human verified with {confidence*100:.1f}% confidence. Strong natural pitch variance and human vocal dynamics detected."
123
  else:
124
+ explanation = f"Audio likely Human ({confidence*100:.1f}%). Detected natural speech fluctuations despite localized artifacts."
125
 
126
  return {
127
  "prediction": prediction,
128
+ "probability_ai": float(f"{prob_fake:.4f}"),
129
  "confidence": float(f"{confidence:.4f}"),
130
  "features": {
131
+ "pitch_variance": float(f"{pitch_var:.2f}"),
132
+ "spectral_flatness": float(f"{flatness:.6f}"),
133
+ "rms_variance": float(f"{rms_var:.4f}"),
134
+ "zcr_variance": float(f"{zcr_var:.4f}")
135
  },
136
  "explanation": explanation
137
  }
app/main.py CHANGED
@@ -73,7 +73,7 @@ async def detect_voice(
73
  raise HTTPException(status_code=400, detail="Could not process audio.")
74
 
75
  # 4. Predict
76
- result = classifier_instance.predict(waveform)
77
 
78
  if "error" in result:
79
  raise HTTPException(status_code=500, detail=result["error"])
 
73
  raise HTTPException(status_code=400, detail="Could not process audio.")
74
 
75
  # 4. Predict
76
+ result = classifier_instance.predict(waveform, language=request_data.language)
77
 
78
  if "error" in result:
79
  raise HTTPException(status_code=500, detail=result["error"])
test_full_audio.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+
4
def test_full_audio(url: str = "http://127.0.0.1:7860/api/voice-detection",
                    api_key: str = "test-key-123") -> None:
    """POST the bundled base64 MP3 sample to the voice-detection API and print the result.

    Args:
        url: Endpoint to POST to. Defaults to the local dev server; made a
            parameter so the same smoke test can target a deployed instance.
        api_key: Value sent in the ``x-api-key`` header.

    Side effects:
        Prints the HTTP status code and the pretty-printed JSON response
        (or an error message on timeout/failure). Returns nothing.
    """
    # Complete Base64 string provided by user
    b64_str = """SUQzAwAAAAAAIlRTU0UAAAAOAAAATGF2ZjYyLjMuMTAwAAAAAAAAAAAAAAD/++TAAAAAAAAAAAAAAAAAAAAAAABJbmZvAAAADwAAAxkAC6GAAAIFCAoNDxIUFxocHyEkJyksLjEzNjk7PkBCRUdKTU9SVFdZXF9hZGZpbG5xc3Z4e36AgoWHioyPkpSXmZyeoaSmqauusbO2uLu9wMLEx8rMz9HU19nc3uHj5unr7vDz9vj7/QAAAABMYXZjNjIuMTEAAAAAAAAAAAAAAAAkBCgAAAAAAAuhgKduISIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP/7xMQAA8AAAaQAAAAgBwBAAPAABAHVMQU1FMy4xMDBVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVAWbjOjxEaBThLYeQGrlm7LmPQgJMHUzSHhGPECo2A4VCmLEjJI0ARHwwYkVAGOEJeg4HGwKFIRBqSIgJDRgAhUD0jEcJhrlHHGuRFr9axKLKcjUW3S8UbYknQtAwwBUyKBcwDCFU2PF1E+EJ4KGMQHQY8PMAGQAIAFlpjqWAQgEFTPoyIuBBx+IFQqpGt4dNSgtQBiFUp5+LfRQbsFAAZj0jgi1j5BUokctWocBgIqJgKALRL9peKDucl4io3i511sTfsLlOeTvEOeAKHYixGotaSyAwJsGECxTucKQIqGd4Q0HXAkDPFg4YOAAUULVjAYMRlqIqYJEPRDTrSCAcoQZkaADxvjxiWJr0pnRKwABMhdccg8ABgjGmZQkxIiAmdWhCYHHDGozcIgMQMMEGSYG
NGgKCMuZ8KZJ8eE6aJ0F85+z5zuQLCCBAa4YW7MMWBzwzbszI9GoxiA0gIybUzII358z1g4QE58cCnBpYg0ZAIoIDgYiCDwAClzZvwE+NIUMAOMMJOYyGA61hkSAkYgBKEq7BgorImMHJ8AAUXZFg6mpnWZQ/Na9A09MRkBhBSdJd8yK82Y0wJUSF2wK5nXpZl7GYuyG5IBx0IBb0+cnOenihrOLXTVH4zxSB0QQC4QLCADNjbxZVLSCoMYYKGPABiRIDgESGwYdDT+WrMv"""

    # Clean up the Base64 string (remove any whitespace/newlines); the
    # triple-quoted literal above contains an embedded newline.
    b64_str = b64_str.replace('\n', '').replace(' ', '').strip()

    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json"
    }
    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": b64_str
    }

    print(f"📡 Testing with FULL audio sample")
    print(f"Base64 length: {len(b64_str)} characters")
    print(f"POSTing to {url}...")
    print()

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        print(f"📥 Status Code: {response.status_code}")
        print()
        print("📄 Response JSON:")
        # NOTE(review): raises if the server returns non-JSON; caught by the
        # generic handler below, which is acceptable for a manual smoke test.
        print(json.dumps(response.json(), indent=2))
    except requests.exceptions.Timeout:
        print("❌ ERROR: Request timed out after 30 seconds")
    except Exception as e:
        print(f"❌ ERROR: {e}")
37
+
38
+ if __name__ == "__main__":
39
+ test_provided_b64()