Spaces:

pavankumarvk
/

Multi_Modal_Deepfake_Detection

Sleeping

App Files Files Community

pavankumarvk committed on Mar 20

Commit

2b9cb11

verified ·

1 Parent(s): 7fe6676

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +95 -50

pipeline.py CHANGED Viewed

@@ -9,6 +9,14 @@ import numpy as np
 import tensorflow as tf
 from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
 # Set random seed for reproducibility.
 tf.random.set_seed(42)
@@ -29,32 +37,17 @@ efficientnet_model = tf.keras.layers.TFSMLayer(
 # ─────────────────────────────────────────────────────────────────────────────
 # Audio Ensemble: 3 models vote — majority wins
-#
-# Model 1: mo-thecreator/Deepfake-audio-detection
-#   Wav2Vec2-base, trained on real/fake speech, 98.82% accuracy
-#
-# Model 2: MelodyMachine/Deepfake-audio-detection-V2
-#   Fine-tuned from mo-thecreator, 99.73% accuracy on evaluation
-#
-# Model 3: Gustking/wav2vec2-large-xlsr-deepfake-audio-classification
-#   Wav2Vec2-large-xlsr, bigger multilingual model, more robust
-#
-# Voting logic:
-#   Each model casts a vote: "real", "ai_synth", or "fake"
-#   Final result = whichever label gets the most votes (majority)
-#   Tie on real vs fake → AI Synthesized (safest middle ground)
 # ─────────────────────────────────────────────────────────────────────────────
 AUDIO_MODELS = [
-    "MelodyMachine/Deepfake-audio-detection-V2",   # Highest accuracy
-    "MelodyMachine/Deepfake-audio-detection",       # Second best
-    "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification",  # Most robust to real-world audio
 ]
 AUDIO_SAMPLE_RATE = 16000
-# ─── Thresholds (applied per model before voting) ────────────────────────────
-REAL_THRESHOLD = 0.50   # real_prob >= 0.50 → vote "real"
-FAKE_THRESHOLD = 0.90   # fake_prob >= 0.90 → vote "fake"
-                        # anything between  → vote "ai_synth"
 print("Loading audio ensemble models ...")
 ensemble = []
@@ -73,7 +66,6 @@ print(f"Ensemble ready with {len(ensemble)} models.")
 def convert_to_mp4(input_path):
-    """Convert any video to .mp4 using ffmpeg (handles webcam .webm, etc.)"""
     ext = os.path.splitext(input_path)[-1].lower()
     if ext == ".mp4":
         cap = cv2.VideoCapture(input_path)
@@ -99,8 +91,6 @@ def convert_to_mp4(input_path):
 class DetectionPipeline:
-    """Pipeline for detecting faces in video frames or processing images."""
     def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
         self.n_frames = n_frames
         self.batch_size = batch_size
@@ -197,13 +187,76 @@ def deepfakes_image_predict(input_image):
         return "🚨 The image is FAKE."
-def get_real_fake_probs(probs, id2label: dict):
     """
-    Map model output probabilities → real/fake floats.
-    Handles all known label naming conventions.
     """
     real_prob, fake_prob = None, None
     for idx, prob in enumerate(probs):
         label = id2label[idx].lower().strip()
         if label in ("real", "label_1", "genuine", "bonafide", "1"):
@@ -211,7 +264,6 @@ def get_real_fake_probs(probs, id2label: dict):
         elif label in ("fake", "label_0", "spoof", "synthetic", "0"):
             fake_prob = float(prob)
-    # Fallback: 0=fake, 1=real
     if real_prob is None or fake_prob is None:
         print("[Audio] Warning: unknown labels — falling back to probs[0]=fake, probs[1]=real")
         fake_prob = float(probs[0])
@@ -221,10 +273,6 @@ def get_real_fake_probs(probs, id2label: dict):
 def single_model_vote(x, entry):
-    """
-    Run one model and return its vote: 'real', 'ai_synth', or 'fake'
-    along with the real/fake probabilities.
-    """
     model_id = entry["id"]
     fe = entry["extractor"]
     m = entry["model"]
@@ -257,30 +305,27 @@ def single_model_vote(x, entry):
 def deepfakes_audio_predict(input_audio):
     """
-    Ensemble audio deepfake detection.
-    All loaded models vote — majority wins.
     Gradio gr.Audio() returns (sample_rate, numpy_array).
     """
     sr, x = input_audio
     print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
-    # Step 1 — float32 + normalise
-    x = x.astype(np.float32)
-    if np.abs(x).max() > 1.0:
-        x = x / 32768.0
-    # Step 2 — stereo → mono
-    if x.ndim == 2:
-        x = x.mean(axis=1)
-    # Step 3 — resample to 16 kHz
-    if sr != AUDIO_SAMPLE_RATE:
-        print(f"[Audio] Resampling {sr} Hz → {AUDIO_SAMPLE_RATE} Hz …")
-        x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
-        print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")
-    # Step 4 — each model votes
     votes = {"real": 0, "ai_synth": 0, "fake": 0}
     all_real_probs = []
     all_fake_probs = []
@@ -299,11 +344,11 @@ def deepfakes_audio_predict(input_audio):
     if len(all_real_probs) == 0:
         return "⚠️ All models failed. Please try again."
-    # Step 5 — majority vote decision
     max_votes = max(votes.values())
     winners = [label for label, count in votes.items() if count == max_votes]
-    # Tie-break: real > ai_synth > fake (bias toward safety)
     if "real" in winners:
         final = "real"
     elif "ai_synth" in winners:

 import tensorflow as tf
 from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
+try:
+    import noisereduce as nr
+    NOISEREDUCE_AVAILABLE = True
+    print("noisereduce available — live recording denoising enabled.")
+except ImportError:
+    NOISEREDUCE_AVAILABLE = False
+    print("noisereduce not available — skipping denoising.")
 # Set random seed for reproducibility.
 tf.random.set_seed(42)
 # ─────────────────────────────────────────────────────────────────────────────
 # Audio Ensemble: 3 models vote — majority wins
 # ─────────────────────────────────────────────────────────────────────────────
 AUDIO_MODELS = [
+    "MelodyMachine/Deepfake-audio-detection-V2",
+    "MelodyMachine/Deepfake-audio-detection",
+    "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification",
 ]
 AUDIO_SAMPLE_RATE = 16000
+# ─── Thresholds ───────────────────────────────────────────────────────────────
+REAL_THRESHOLD = 0.50
+FAKE_THRESHOLD = 0.90
 print("Loading audio ensemble models ...")
 ensemble = []
 def convert_to_mp4(input_path):
     ext = os.path.splitext(input_path)[-1].lower()
     if ext == ".mp4":
         cap = cv2.VideoCapture(input_path)
 class DetectionPipeline:
     def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
         self.n_frames = n_frames
         self.batch_size = batch_size
         return "🚨 The image is FAKE."
def preprocess_audio(x: np.ndarray, sr: int, is_live: bool) -> np.ndarray:
    """
    Prepare a raw waveform for the audio models.

    Uploaded file:
      float32 → int16 rescale → mono → resample
    Live recording (extra steps, in the order the code applies them):
      float32 → int16 rescale → mono → resample → denoise → trim silence
      → peak normalize

    Args:
        x: Waveform samples, either 1-D or 2-D. A 2-D array is averaged over
           axis 1, i.e. it is assumed to be (samples, channels) — confirm
           against the caller.
        sr: Sample rate of ``x`` in Hz.
        is_live: True to apply the extra live-microphone steps.

    Returns:
        A 1-D float32 array at AUDIO_SAMPLE_RATE, zero-padded at the end to
        at least 0.5 s. Empty input yields 0.5 s of zeros instead of raising.
    """
    min_samples = AUDIO_SAMPLE_RATE // 2

    # Step 1 — float32 + int16 normalise.
    # Any sample above 1.0 is taken to mean int16-range data. The size check
    # matters: max() of a zero-length array raises ValueError.
    x = x.astype(np.float32)
    if x.size and np.abs(x).max() > 1.0:
        x = x / 32768.0

    # Step 2 — stereo → mono
    if x.ndim == 2:
        x = x.mean(axis=1)

    # Step 3 — resample to 16 kHz (skipped when there is nothing to resample)
    if x.size and sr != AUDIO_SAMPLE_RATE:
        print(f"[Audio] Resampling {sr} Hz → {AUDIO_SAMPLE_RATE} Hz …")
        x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)

    if is_live and x.size:
        print("[Audio] Live recording detected — applying enhanced preprocessing …")

        # Step 4 — Noise reduction
        # Uses first 0.5s as noise profile (usually silence before speaking)
        if NOISEREDUCE_AVAILABLE and len(x) > min_samples:
            noise_sample = x[:min_samples]
            x = nr.reduce_noise(
                y=x,
                sr=AUDIO_SAMPLE_RATE,
                y_noise=noise_sample,
                prop_decrease=0.75,   # aggressive but not total noise removal
                stationary=False,     # handles non-stationary noise (room noise)
            )
            print("[Audio] Noise reduction applied.")

        # Step 5 — Trim leading/trailing silence
        # Live recordings often have silence at start/end before/after speaking
        x, _ = librosa.effects.trim(
            x,
            top_db=20,        # anything 20dB below peak = silence
            frame_length=512,
            hop_length=128,
        )
        print(f"[Audio] After trim: {len(x)} samples ({len(x)/AUDIO_SAMPLE_RATE:.2f}s)")

        # Step 6 — Peak normalize to -3dBFS
        # Live mics often record too quietly, which confuses the model.
        # Guarded with a size check in case trimming left no samples.
        peak = np.abs(x).max() if x.size else 0.0
        if peak > 0:
            x = x / peak * 0.707   # normalize to ~-3dBFS
        print("[Audio] Peak normalization applied.")

    # Final check — must have at least 0.5s of audio
    if len(x) < min_samples:
        x = np.pad(x, (0, min_samples - len(x)), mode='constant')
    return x
+def get_real_fake_probs(probs, id2label: dict):
     real_prob, fake_prob = None, None
+    print(f"[Audio] id2label: {id2label}")
     for idx, prob in enumerate(probs):
         label = id2label[idx].lower().strip()
         if label in ("real", "label_1", "genuine", "bonafide", "1"):
         elif label in ("fake", "label_0", "spoof", "synthetic", "0"):
             fake_prob = float(prob)
     if real_prob is None or fake_prob is None:
         print("[Audio] Warning: unknown labels — falling back to probs[0]=fake, probs[1]=real")
         fake_prob = float(probs[0])
 def single_model_vote(x, entry):
     model_id = entry["id"]
     fe = entry["extractor"]
     m = entry["model"]
 def deepfakes_audio_predict(input_audio):
     """
+    Detect whether audio is: Real Human Voice / AI Synthesized / Fake.
     Gradio gr.Audio() returns (sample_rate, numpy_array).
+    Detects if input is live recording or uploaded file and applies
+    appropriate preprocessing accordingly.
     """
     sr, x = input_audio
     print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
+    # ── Detect if live recording ──────────────────────────────────────────────
+    # Live recordings from browser microphone typically arrive at 48000 Hz.
+    # Uploaded files can be any sample rate but are rarely exactly 48000.
+    # Duration under 30s also strongly suggests live recording.
+    duration = len(x) / sr
+    is_live = (sr == 48000 and duration < 30.0)
+    print(f"[Audio] Source: {'🎙️ Live recording' if is_live else '📁 Uploaded file'} | duration={duration:.2f}s")
+    # ── Preprocess ────────────────────────────────────────────────────────────
+    x = preprocess_audio(x, sr, is_live)
+    # ── Ensemble voting ───────────────────────────────────────────────────────
     votes = {"real": 0, "ai_synth": 0, "fake": 0}
     all_real_probs = []
     all_fake_probs = []
     if len(all_real_probs) == 0:
         return "⚠️ All models failed. Please try again."
+    # ── Majority vote with tie-break ──────────────────────────────────────────
     max_votes = max(votes.values())
     winners = [label for label, count in votes.items() if count == max_votes]
+    # Tie-break: bias toward real to avoid false positives on genuine voices
     if "real" in winners:
         final = "real"
     elif "ai_synth" in winners: