Spaces:

akagtag
/

deepdetection

Paused

akagtag committed on Apr 18

Commit

8cf40cb

1 Parent(s): 7250f83

feat: Enhance generator detection and attribution mechanisms

- Introduced NoveltyDetector for detecting unseen generators using a CLIP embedding ring buffer and IsolationForest.
- Added GeneratorRegistry for monitoring generator performance and retention, flagging those below a defined threshold.
- Updated FingerprintEngine to include DCT frequency analysis and improved generator attribution logic.
- Enhanced CoherenceEngine with audio lip-sync analysis, integrating it into the video processing pipeline.
- Implemented Dempster-Shafer evidence fusion in Fuser for more robust verdicts based on multiple engine outputs.
- Revised generator labels and their corresponding prompts to align with the updated taxonomy.
- Added support for audio coherence sub-scores and timestamp markers in detection responses.

Files changed (8) hide show

src/api/main.py +14 -7
src/continual/novelty_detector.py +131 -0
src/continual/registry.py +114 -0
src/engines/coherence/engine.py +173 -7
src/engines/fingerprint/engine.py +82 -15
src/engines/sstgnn/engine.py +66 -2
src/fusion/fuser.py +126 -32
src/types.py +25 -12

src/api/main.py CHANGED Viewed

@@ -17,6 +17,8 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import HTMLResponse
 from PIL import ExifTags, Image
 from src.engines.coherence.engine import CoherenceEngine
 from src.engines.fingerprint.engine import FingerprintEngine
 from src.engines.sstgnn.engine import SSTGNNEngine
@@ -63,6 +65,10 @@ _co = CoherenceEngine()
 _st = SSTGNNEngine()
 _hf = HFInferenceClient()
 MAX_IMAGE_MB = int(os.environ.get("MAX_IMAGE_SIZE_MB", 20))
 MAX_VIDEO_MB = int(os.environ.get("MAX_VIDEO_SIZE_MB", 100))
 MAX_FRAMES = int(os.environ.get("MAX_VIDEO_FRAMES", 300))
@@ -72,13 +78,14 @@ VIDEO_TYPES = {"video/mp4", "video/quicktime", "video/x-msvideo", "video/webm",
 SUPPORTED_GENERATORS = [
     "real",
-    "unknown_gan",
     "stable_diffusion",
     "midjourney",
     "dall_e",
-    "flux",
-    "firefly",
-    "imagen",
 ]
 SYNTHETIC_KEYWORDS = (
@@ -174,7 +181,7 @@ def _apply_metadata_keyword_signal(
             engine="metadata_signal",
             verdict="FAKE",
             confidence=0.98,
-            attributed_generator="unknown_gan",
             explanation=f"Filename/metadata contains synthetic keyword(s): {', '.join(hits)}.",
             processing_time_ms=0.0,
         )
@@ -189,7 +196,7 @@ def _apply_metadata_keyword_signal(
         flagged.verdict = "FAKE"
         flagged.confidence = max(flagged.confidence, 0.85)
         if flagged.attributed_generator == "real":
-            flagged.attributed_generator = "unknown_gan"
     return flagged
@@ -342,7 +349,7 @@ def _hf_generator_label(preds: list[dict], verdict: str) -> str:
             continue
         if candidate.replace("_", " ") in labels or candidate in labels:
             return candidate
-    return "unknown_gan"
 def _build_hf_response(preds: list[dict], elapsed_ms: float, media_type: str) -> DetectionResponse:

 from fastapi.responses import HTMLResponse
 from PIL import ExifTags, Image
+from src.continual.novelty_detector import NoveltyDetector
+from src.continual.registry import GeneratorRegistry
 from src.engines.coherence.engine import CoherenceEngine
 from src.engines.fingerprint.engine import FingerprintEngine
 from src.engines.sstgnn.engine import SSTGNNEngine
 _st = SSTGNNEngine()
 _hf = HFInferenceClient()
+# Module 4 — Continual Learning backbone (paper §III-D)
+_novelty_detector = NoveltyDetector(buffer_size=500, min_fit_size=50, refit_interval=25)
+_generator_registry = GeneratorRegistry()
 MAX_IMAGE_MB = int(os.environ.get("MAX_IMAGE_SIZE_MB", 20))
 MAX_VIDEO_MB = int(os.environ.get("MAX_VIDEO_SIZE_MB", 100))
 MAX_FRAMES = int(os.environ.get("MAX_VIDEO_FRAMES", 300))
 SUPPORTED_GENERATORS = [
     "real",
+    "sora",
+    "runway",
+    "wav2lip",
     "stable_diffusion",
+    "sdxl",
     "midjourney",
     "dall_e",
+    "unknown_generative",
 ]
 SYNTHETIC_KEYWORDS = (
             engine="metadata_signal",
             verdict="FAKE",
             confidence=0.98,
+            attributed_generator="unknown_generative",
             explanation=f"Filename/metadata contains synthetic keyword(s): {', '.join(hits)}.",
             processing_time_ms=0.0,
         )
         flagged.verdict = "FAKE"
         flagged.confidence = max(flagged.confidence, 0.85)
         if flagged.attributed_generator == "real":
+            flagged.attributed_generator = "unknown_generative"
     return flagged
             continue
         if candidate.replace("_", " ") in labels or candidate in labels:
             return candidate
+    return "unknown_generative"
 def _build_hf_response(preds: list[dict], elapsed_ms: float, media_type: str) -> DetectionResponse:

src/continual/novelty_detector.py ADDED Viewed

	@@ -0,0 +1,131 @@

+"""
+src/continual/novelty_detector.py — Novel-generator detection via CLIP ring buffer.
+Implements Epic 4 of the paper: a CLIP embedding ring buffer with an IsolationForest
+that detects when an input resembles a generator not seen during training.
+Architecture (paper Fig. 1, Epic 4):
+    CLIP embedding ring buffer → IsolationForest → novelty_score [0, 1]
+A high novelty_score indicates the input may come from a generator not yet
+indexed by the fingerprint module — this is the anti-Detector-Rot signal.
+"""
+from __future__ import annotations
+import logging
+import threading
+from collections import deque
+from typing import Optional
+import numpy as np
+logger = logging.getLogger(__name__)
+class NoveltyDetector:
+    """
+    CLIP embedding ring buffer + IsolationForest novelty detector.
+    After at least `min_fit_size` embeddings accumulate, an IsolationForest
+    is fitted on the buffer. Every subsequent embedding receives a novelty
+    score in [0, 1]. 0.5 is returned until the forest is ready.
+    Thread-safe: `update` acquires the internal lock; the `buffer_size` and `is_ready` properties read state without it.
+    Parameters
+    ----------
+    buffer_size:    Maximum embeddings to retain (FIFO eviction).
+    min_fit_size:   Minimum buffer size before the first forest fit.
+    refit_interval: How many updates between successive refits.
+    contamination:  Expected outlier fraction (passed to IsolationForest).
+    """
+    def __init__(
+        self,
+        buffer_size: int = 500,
+        min_fit_size: int = 50,
+        refit_interval: int = 25,
+        contamination: float = 0.1,
+    ) -> None:
+        self._buffer: deque[np.ndarray] = deque(maxlen=buffer_size)
+        self._forest = None
+        self._lock = threading.Lock()
+        self._min_fit_size = min_fit_size
+        self._refit_interval = refit_interval
+        self._contamination = contamination
+        self._n_updates = 0
+    # ------------------------------------------------------------------
+    # Public
+    # ------------------------------------------------------------------
+    def update(self, clip_embedding: np.ndarray) -> float:
+        """
+        Add `clip_embedding` to the ring buffer and return a novelty score.
+        Returns 0.5 until the buffer has at least `min_fit_size` samples.
+        Parameters
+        ----------
+        clip_embedding: 1-D (or any shape, will be flattened) float32 array
+                        from CLIP's image encoder.
+        Returns
+        -------
+        novelty_score: float in [0, 1]. Higher = more novel (unseen generator).
+        """
+        with self._lock:
+            emb = clip_embedding.flatten().astype(np.float32)
+            self._buffer.append(emb)
+            self._n_updates += 1
+            n = len(self._buffer)
+            if n >= self._min_fit_size and self._n_updates % self._refit_interval == 0:
+                self._refit()
+            if self._forest is None or n < self._min_fit_size:
+                return 0.5
+            try:
+                # score_samples: more negative = more anomalous
+                raw = float(self._forest.score_samples([emb])[0])
+                # Typical range: [-0.5, 0.0]. Map to [0, 1].
+                novelty = float(np.clip((-raw - 0.1) / 0.4, 0.0, 1.0))
+                return novelty
+            except Exception as exc:
+                logger.warning("IsolationForest scoring error: %s", exc)
+                return 0.5
+    @property
+    def buffer_size(self) -> int:
+        """Current number of embeddings in the ring buffer."""
+        return len(self._buffer)
+    @property
+    def is_ready(self) -> bool:
+        """True once the IsolationForest has been fitted at least once."""
+        return self._forest is not None and len(self._buffer) >= self._min_fit_size
+    # ------------------------------------------------------------------
+    # Private
+    # ------------------------------------------------------------------
+    def _refit(self) -> None:
+        """Fit a fresh IsolationForest on all buffered embeddings."""
+        try:
+            from sklearn.ensemble import IsolationForest  # type: ignore
+            X = np.array(list(self._buffer), dtype=np.float32)
+            forest = IsolationForest(
+                contamination=self._contamination,
+                random_state=42,
+                n_estimators=50,  # lightweight — no GPU required
+            )
+            forest.fit(X)
+            self._forest = forest
+            logger.debug(
+                "NoveltyDetector: refitted IsolationForest on %d embeddings",
+                len(self._buffer),
+            )
+        except Exception as exc:
+            logger.warning("NoveltyDetector refit failed: %s", exc)

src/continual/registry.py CHANGED Viewed

@@ -22,6 +22,7 @@ from __future__ import annotations
 import json
 import logging
 import os
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
@@ -119,3 +120,116 @@ class TaskRegistry:
             json.dumps(tasks, indent=2, default=str),
             encoding="utf-8",
         )

 import json
 import logging
 import os
+import threading
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
             json.dumps(tasks, indent=2, default=str),
             encoding="utf-8",
         )
+# ---------------------------------------------------------------------------
+# GeneratorRegistry — retention-aware live monitoring registry
+# ---------------------------------------------------------------------------
+_DEFAULT_RETENTION_PATH = Path(
+    os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
+) / "generator_registry.json"
+RETENTION_THRESHOLD = 0.85  # paper §III-D: flag if retention drops below 85%
+class GeneratorRegistry:
+    """
+    Live monitoring registry for per-generator knowledge retention.
+    Tracks how well the system performs on each known generator class and
+    flags generators whose accuracy drops below the 85% retention threshold
+    (paper §III-D). Designed to be called by APScheduler every N hours.
+    The registry is persisted to a JSON file at MODEL_CACHE_DIR/generator_registry.json
+    and survives container restarts on HF Spaces' /data volume.
+    Parameters
+    ----------
+    path:
+        Path to the JSON persistence file. Defaults to MODEL_CACHE_DIR/generator_registry.json.
+    """
+    def __init__(self, path: Path | str | None = None) -> None:
+        self._path = Path(path) if path else _DEFAULT_RETENTION_PATH
+        self._lock = threading.Lock()
+    # ------------------------------------------------------------------
+    # Retention tracking
+    # ------------------------------------------------------------------
+    def record_prediction(self, generator_label: str, correct: bool) -> None:
+        """
+        Record whether a prediction for `generator_label` was correct.
+        Updates a running accuracy estimate using an exponential moving average.
+        """
+        with self._lock:
+            data = self._load()
+            entry = data.setdefault(generator_label, {"ema_accuracy": 1.0, "n_samples": 0, "flagged": False})
+            n = entry["n_samples"]
+            alpha = min(0.1, 2.0 / (n + 2))  # EMA decay; stabilises after ~20 samples
+            entry["ema_accuracy"] = (1 - alpha) * entry["ema_accuracy"] + alpha * (1.0 if correct else 0.0)
+            entry["n_samples"] += 1
+            entry["flagged"] = entry["ema_accuracy"] < RETENTION_THRESHOLD
+            self._save(data)
+    def retention_scores(self) -> dict[str, float]:
+        """Return {generator_label: ema_accuracy} for all tracked generators."""
+        return {k: v["ema_accuracy"] for k, v in self._load().items()}
+    def flagged_generators(self) -> list[str]:
+        """Return labels whose retention dropped below the 85% threshold."""
+        return [k for k, v in self._load().items() if v.get("flagged")]
+    def check_retention(self) -> None:
+        """
+        APScheduler job: log retention status and warn on degraded generators.
+        Called automatically on a schedule (e.g., every 6 hours).
+        Any generator below RETENTION_THRESHOLD is logged as a warning so that
+        operators can trigger a manual review cycle.
+        """
+        flagged = self.flagged_generators()
+        scores = self.retention_scores()
+        if not scores:
+            logger.info("GeneratorRegistry: no retention data recorded yet.")
+            return
+        logger.info(
+            "GeneratorRegistry retention check — %d generators tracked, %d flagged.",
+            len(scores),
+            len(flagged),
+        )
+        for label, acc in sorted(scores.items()):
+            level = logging.WARNING if acc < RETENTION_THRESHOLD else logging.DEBUG
+            logger.log(level, "  %s: EMA accuracy = %.1f%%", label, acc * 100)
+        if flagged:
+            logger.warning(
+                "Generators below %.0f%% retention threshold: %s. "
+                "Consider triggering an incremental update cycle.",
+                RETENTION_THRESHOLD * 100,
+                ", ".join(flagged),
+            )
+    # ------------------------------------------------------------------
+    # Internal
+    # ------------------------------------------------------------------
+    def _load(self) -> dict[str, Any]:
+        if not self._path.exists():
+            return {}
+        try:
+            return json.loads(self._path.read_text(encoding="utf-8"))
+        except json.JSONDecodeError:
+            logger.warning("GeneratorRegistry file corrupt; starting fresh.")
+            return {}
+    def _save(self, data: dict[str, Any]) -> None:
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+        self._path.write_text(
+            json.dumps(data, indent=2, default=str),
+            encoding="utf-8",
+        )

src/engines/coherence/engine.py CHANGED Viewed

@@ -2,10 +2,13 @@ from __future__ import annotations
 import logging
 import os
 import threading
 import time
 import urllib.request
 from pathlib import Path
 import numpy as np
 from PIL import Image
@@ -194,7 +197,20 @@ class CoherenceEngine:
             logger.warning("Coherence image scoring failed: %s", exc)
             return 0.35
-    def run_video(self, frames: list[np.ndarray]) -> EngineResult:
         t0 = time.perf_counter()
         self._ensure()
@@ -216,21 +232,171 @@ class CoherenceEngine:
         delta = self._embedding_variance(frames)
         jerk = self._landmark_jerk(frames)
         blink = self._blink_anomaly(frames)
-        score = float(np.clip(delta * 0.45 + jerk * 0.35 + blink * 0.20, 0.0, 1.0))
         return EngineResult(
             engine="coherence",
             verdict="FAKE" if score > 0.5 else "REAL",
             confidence=score,
             attributed_generator=None,
-            explanation=(
-                f"Embedding variance {delta:.2f}, "
-                f"landmark jerk {jerk:.2f}, "
-                f"blink anomaly {blink:.2f}."
-            ),
             processing_time_ms=(time.perf_counter() - t0) * 1000,
         )
     def _embedding_variance(self, frames: list[np.ndarray]) -> float:
         if _mtcnn is None or _resnet is None or _torch is None:
             return 0.5

 import logging
 import os
+import subprocess
+import tempfile
 import threading
 import time
 import urllib.request
 from pathlib import Path
+from typing import Optional
 import numpy as np
 from PIL import Image
             logger.warning("Coherence image scoring failed: %s", exc)
             return 0.35
+    def run_video(
+        self,
+        frames: list[np.ndarray],
+        video_path: Optional[str] = None,
+    ) -> EngineResult:
+        """
+        Temporal coherence analysis.
+        Args:
+            frames:     RGB frames extracted from the video.
+            video_path: Optional path to the source video file. When provided,
+                        audio is extracted and MFCC lip-sync cross-correlation
+                        is computed (paper Module 1 / LipFD extension).
+        """
         t0 = time.perf_counter()
         self._ensure()
         delta = self._embedding_variance(frames)
         jerk = self._landmark_jerk(frames)
         blink = self._blink_anomaly(frames)
+        visual_score = float(np.clip(delta * 0.45 + jerk * 0.35 + blink * 0.20, 0.0, 1.0))
+        # Audio lip-sync cross-correlation (LipFD-inspired, paper §III-A)
+        audio_anomaly: Optional[float] = None
+        timestamp_markers: list[dict] = []
+        if video_path is not None:
+            audio_anomaly, timestamp_markers = self._audio_lipsync_score(video_path, frames)
+        if audio_anomaly is not None:
+            # Weighted: visual 60%, audio 40% (paper weights for Module 1)
+            score = float(np.clip(visual_score * 0.60 + audio_anomaly * 0.40, 0.0, 1.0))
+            explanation = (
+                f"Embedding variance {delta:.2f}, landmark jerk {jerk:.2f}, "
+                f"blink anomaly {blink:.2f}. "
+                f"Audio lip-sync anomaly {audio_anomaly:.2f} "
+                f"({len(timestamp_markers)} flagged segment(s))."
+            )
+        else:
+            score = visual_score
+            explanation = (
+                f"Embedding variance {delta:.2f}, "
+                f"landmark jerk {jerk:.2f}, "
+                f"blink anomaly {blink:.2f}."
+            )
         return EngineResult(
             engine="coherence",
             verdict="FAKE" if score > 0.5 else "REAL",
             confidence=score,
             attributed_generator=None,
+            explanation=explanation,
             processing_time_ms=(time.perf_counter() - t0) * 1000,
+            audio_sync_score=audio_anomaly,
+            timestamp_markers=timestamp_markers,
+        )
+    def _audio_lipsync_score(
+        self,
+        video_path: str,
+        frames: list[np.ndarray],
+    ) -> tuple[float, list[dict]]:
+        """
+        MFCC cross-correlation with lip-aperture motion curve (paper §III-A).
+        Extracts mono 16 kHz audio via ffmpeg, computes MFCC energy envelope,
+        computes per-frame lip-aperture from MediaPipe, resamples the lip curve to the
+        MFCC curve's length, and maps their Pearson correlation to an anomaly score.
+        Returns:
+            (sync_anomaly_score, timestamp_markers)
+            sync_anomaly_score: 0 = perfectly in sync, 1 = totally out of sync
+            timestamp_markers: list of {start_s, end_s, correlation} dicts for
+                               segments where correlation < 0.2
+        """
+        try:
+            import librosa  # type: ignore
+            from scipy.stats import pearsonr  # type: ignore
+        except ImportError as exc:
+            logger.warning("Audio analysis unavailable (missing dep): %s", exc)
+            return 0.35, []
+        audio_tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        audio_path = audio_tmp.name
+        audio_tmp.close()
+        try:
+            cmd = [
+                "ffmpeg", "-i", video_path,
+                "-ac", "1", "-ar", "16000",
+                "-vn",           # no video output
+                "-f", "wav",
+                audio_path,
+                "-y", "-loglevel", "error",
+            ]
+            result = subprocess.run(cmd, capture_output=True, timeout=30)
+            if result.returncode != 0:
+                logger.debug("ffmpeg audio extract returned %d (no audio?)", result.returncode)
+                return 0.35, []
+            try:
+                y, sr = librosa.load(audio_path, sr=16000, mono=True)
+            except Exception as exc:
+                logger.warning("librosa load failed: %s", exc)
+                return 0.35, []
+        finally:
+            Path(audio_path).unlink(missing_ok=True)
+        if len(y) < sr * 0.5:
+            return 0.35, []  # less than 0.5 s of audio → inconclusive
+        # Audio energy envelope from MFCC
+        hop_length = 512
+        try:
+            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
+            audio_curve = np.mean(np.abs(mfcc), axis=0).astype(np.float32)
+        except Exception as exc:
+            logger.warning("MFCC computation failed: %s", exc)
+            return 0.35, []
+        # Lip-aperture curve from MediaPipe (inner upper lip=13, lower=14)
+        if _face_mesh is None:
+            return 0.35, []
+        lip_apertures: list[float] = []
+        for frame in frames:
+            try:
+                res = _face_mesh.process(frame)
+                if res.multi_face_landmarks:
+                    lm = res.multi_face_landmarks[0].landmark
+                    h, w = frame.shape[:2]
+                    upper = np.array([lm[13].x * w, lm[13].y * h], dtype=np.float32)
+                    lower = np.array([lm[14].x * w, lm[14].y * h], dtype=np.float32)
+                    lip_apertures.append(float(np.linalg.norm(upper - lower)))
+                else:
+                    lip_apertures.append(0.0)
+            except Exception:
+                lip_apertures.append(0.0)
+        if len(lip_apertures) < 4 or float(np.std(lip_apertures)) < 1e-6:
+            return 0.35, []  # static lip → can't measure sync
+        # Resample lip curve to match audio_curve length
+        lip_curve = np.array(lip_apertures, dtype=np.float32)
+        target_len = len(audio_curve)
+        lip_resampled = np.interp(
+            np.linspace(0, len(lip_curve) - 1, target_len),
+            np.arange(len(lip_curve)),
+            lip_curve,
         )
+        if target_len < 4:
+            return 0.35, []
+        # Overall Pearson correlation
+        try:
+            r_overall, _ = pearsonr(audio_curve, lip_resampled)
+        except Exception:
+            r_overall = 0.0
+        # Map correlation → anomaly score
+        # Real speech: r typically > 0.3; deepfake: often < 0.1 or negative
+        sync_anomaly = float(np.clip((0.3 - float(r_overall)) / 0.5 + 0.35, 0.0, 1.0))
+        # Sliding-window timestamp markers for low-correlation segments
+        hop_s = hop_length / sr  # seconds per MFCC frame
+        markers: list[dict] = []
+        window = max(10, target_len // 10)
+        stride = max(1, window // 2)
+        for i in range(0, target_len - window, stride):
+            seg_audio = audio_curve[i : i + window]
+            seg_lip = lip_resampled[i : i + window]
+            try:
+                r_seg, _ = pearsonr(seg_audio, seg_lip)
+            except Exception:
+                continue
+            if float(r_seg) < 0.2:
+                markers.append({
+                    "start_s": round(i * hop_s, 2),
+                    "end_s": round((i + window) * hop_s, 2),
+                    "correlation": round(float(r_seg), 3),
+                })
+        return sync_anomaly, markers
     def _embedding_variance(self, frames: list[np.ndarray]) -> float:
         if _mtcnn is None or _resnet is None or _torch is None:
             return 0.5

src/engines/fingerprint/engine.py CHANGED Viewed

@@ -29,14 +29,15 @@ DETECTOR_CANDIDATES = [
 ]
 GENERATOR_PROMPTS: dict[str, str] = {
-    "real": "a real photograph taken by a camera with natural lighting and grain",
-    "unknown_gan": "a GAN-generated image with checkerboard artifacts and blurry edges",
-    "stable_diffusion": "an image generated by Stable Diffusion with painterly soft textures and dreamlike quality",
-    "midjourney": "an image generated by Midjourney with cinematic dramatic lighting and extreme hyperdetail",
-    "dall_e": "an image generated by DALL-E with clean flat illustration style and smooth gradients",
-    "flux": "an image generated by FLUX with photorealistic high-frequency detail and sharp textures",
-    "firefly": "an image generated by Adobe Firefly with polished commercial stock-photo aesthetics",
-    "imagen": "an image generated by Google Imagen with precise photorealistic rendering and clean edges",
 }
 FAKE_LABEL_KEYWORDS = (
@@ -68,6 +69,10 @@ _clip_model: Optional[CLIPModel] = None
 _clip_processor: Optional[CLIPProcessor] = None
 _loaded = False
 def _get_pipeline():
     try:
@@ -195,7 +200,12 @@ class FingerprintEngine:
             except Exception as exc:
                 logger.warning("Detector %s inference error: %s", model_id, _short_error(exc))
-        fake_score = (weighted_fake / total_w) if total_w > 0 else 0.5
         generator = self._attribute_generator(image, fake_score)
         return EngineResult(
@@ -204,7 +214,7 @@ class FingerprintEngine:
             confidence=float(fake_score),
             attributed_generator=generator,
             explanation=(
-                f"Ensemble fake score {fake_score:.2f} across {len(_detectors)} detectors. "
                 f"Generator attributed to: {generator}."
             ),
             processing_time_ms=(time.perf_counter() - t0) * 1000,
@@ -212,7 +222,8 @@ class FingerprintEngine:
     def _attribute_generator(self, image: Image.Image, fake_score: float) -> str:
         if _clip_model is None or _clip_processor is None:
-            return "unknown_gan" if fake_score > 0.5 else "real"
         try:
             texts = list(GENERATOR_PROMPTS.values())
@@ -225,18 +236,74 @@ class FingerprintEngine:
                 max_length=77,
             )
             with torch.no_grad():
-                logits = _clip_model(**inputs).logits_per_image[0]
             probs = logits.softmax(dim=0).cpu().numpy()
-            generator = list(GENERATOR_PROMPTS.keys())[int(np.argmax(probs))]
             if fake_score > 0.65 and generator == "real":
-                generator = "unknown_gan"
             if fake_score < 0.35 and generator != "real":
                 generator = "real"
             return generator
         except Exception as exc:
             logger.warning("CLIP attribution error: %s", _short_error(exc))
-            return "unknown_gan" if fake_score > 0.5 else "real"
     def run_video(self, frames: list) -> EngineResult:
         t0 = time.perf_counter()

 ]
 GENERATOR_PROMPTS: dict[str, str] = {
+    "real":               "a real photograph taken by a camera with natural lighting and film grain",
+    "sora":               "a Sora text-to-video frame with temporal coherence and photorealistic lighting",
+    "runway":             "a Runway Gen-2 frame with painterly dreamlike motion blur and color grading",
+    "wav2lip":            "a Wav2Lip face-swap with sharp lip boundary artifacts and texture inconsistency at mouth edges",
+    "stable_diffusion":   "an image generated by Stable Diffusion with painterly soft textures and dreamlike quality",
+    "sdxl":               "an image generated by SDXL with high resolution detail, sharp edges and crisp textures",
+    "midjourney":         "an image generated by Midjourney with cinematic dramatic lighting and extreme hyperdetail",
+    "dall_e":             "an image generated by DALL-E with clean flat illustration style and smooth gradients",
+    "unknown_generative": "an AI-generated image with unidentifiable generator-specific artifacts and synthetic patterns",
 }
 FAKE_LABEL_KEYWORDS = (
 _clip_processor: Optional[CLIPProcessor] = None
 _loaded = False
+# Thread-local storage: each request thread stores its last CLIP embedding here
+# so the novelty detector can consume it without a second forward pass.
+_thread_local = threading.local()
 def _get_pipeline():
     try:
             except Exception as exc:
                 logger.warning("Detector %s inference error: %s", model_id, _short_error(exc))
+        ensemble_score = (weighted_fake / total_w) if total_w > 0 else 0.5
+        # DCT frequency band analysis (paper §III-B / Kim et al.)
+        dct_score = self._dct_frequency_score(image)
+        fake_score = float(np.clip(ensemble_score * 0.85 + dct_score * 0.15, 0.0, 1.0))
         generator = self._attribute_generator(image, fake_score)
         return EngineResult(
             confidence=float(fake_score),
             attributed_generator=generator,
             explanation=(
+                f"Ensemble {ensemble_score:.2f} × 0.85 + DCT {dct_score:.2f} × 0.15 = {fake_score:.2f}. "
                 f"Generator attributed to: {generator}."
             ),
             processing_time_ms=(time.perf_counter() - t0) * 1000,
     def _attribute_generator(self, image: Image.Image, fake_score: float) -> str:
         if _clip_model is None or _clip_processor is None:
+            _thread_local.last_clip_embedding = None
+            return "unknown_generative" if fake_score > 0.5 else "real"
         try:
             texts = list(GENERATOR_PROMPTS.values())
                 max_length=77,
             )
             with torch.no_grad():
+                outputs = _clip_model(**inputs)
+                logits = outputs.logits_per_image[0]
+                # Store image embedding for novelty detection
+                image_embeds = outputs.image_embeds.detach().cpu().numpy()[0]
+                _thread_local.last_clip_embedding = image_embeds
             probs = logits.softmax(dim=0).cpu().numpy()
+            max_prob = float(np.max(probs))
+            # Low confidence attribution → unknown generator
+            if max_prob < 0.25:
+                generator = "unknown_generative"
+            else:
+                generator = list(GENERATOR_PROMPTS.keys())[int(np.argmax(probs))]
             if fake_score > 0.65 and generator == "real":
+                generator = "unknown_generative"
             if fake_score < 0.35 and generator != "real":
                 generator = "real"
             return generator
         except Exception as exc:
             logger.warning("CLIP attribution error: %s", _short_error(exc))
+            _thread_local.last_clip_embedding = None
+            return "unknown_generative" if fake_score > 0.5 else "real"
+    def _dct_frequency_score(self, image: Image.Image) -> float:
+        """
+        DCT frequency band analysis (paper §III-B).
+        High-frequency energy ratio is an anomaly signal: real photos follow
+        a predictable DCT energy roll-off; AI generators often deviate.
+        Returns float [0, 1] where higher = more anomalous.
+        """
+        try:
+            from scipy.fft import dctn  # type: ignore
+            gray = np.array(image.convert("L"), dtype=np.float32)
+            h, w = gray.shape
+            # Align to 8×8 block boundary (JPEG-DCT standard)
+            bh, bw = h - h % 8, w - w % 8
+            if bh < 8 or bw < 8:
+                return 0.3
+            crop = gray[:bh, :bw]
+            # Reshape into (n_blocks_h, n_blocks_w, 8, 8) then DCT each 8×8 block
+            blocks = crop.reshape(bh // 8, 8, bw // 8, 8).transpose(0, 2, 1, 3)
+            n_bh, n_bw = blocks.shape[:2]
+            dc_energy_total = 0.0
+            all_energy_total = 0.0
+            for bi in range(n_bh):
+                for bj in range(n_bw):
+                    dct_block = dctn(blocks[bi, bj], norm="ortho")
+                    dc_energy_total += float(dct_block[0, 0] ** 2)
+                    all_energy_total += float(np.sum(dct_block ** 2))
+            if all_energy_total < 1e-9:
+                return 0.3
+            ac_ratio = 1.0 - (dc_energy_total / all_energy_total)
+            # Real photos: ac_ratio ≈ 0.80–0.90; AI images can deviate significantly
+            score = float(np.clip(abs(ac_ratio - 0.85) / 0.15, 0.0, 1.0))
+            return score
+        except Exception as exc:
+            logger.warning("DCT frequency score error: %s", _short_error(exc))
+            return 0.3
+    def get_last_clip_embedding(self) -> Optional[np.ndarray]:
+        """Return the CLIP image embedding from the most recent run() call in this thread."""
+        return getattr(_thread_local, "last_clip_embedding", None)
     def run_video(self, frames: list) -> EngineResult:
         t0 = time.perf_counter()

src/engines/sstgnn/engine.py CHANGED Viewed

@@ -303,6 +303,61 @@ class SSTGNNEngine:
             logger.warning("Geometry score error: %s", exc)
             return 0.3
     def run_video(self, frames: list[np.ndarray]) -> EngineResult:
         t0 = time.perf_counter()
         self._ensure()
@@ -319,14 +374,23 @@ class SSTGNNEngine:
         sample = frames[::6] or [frames[0]]
         results = [self.run(Image.fromarray(frame)) for frame in sample]
-        avg = float(np.mean([r.confidence for r in results]))
         return EngineResult(
             engine="sstgnn",
             verdict="FAKE" if avg > 0.5 else "REAL",
             confidence=avg,
             attributed_generator=None,
-            explanation=f"Frame-sampled SSTGNN average {avg:.2f} over {len(sample)} frames.",
             processing_time_ms=(time.perf_counter() - t0) * 1000,
         )

             logger.warning("Geometry score error: %s", exc)
             return 0.3
+    def _temporal_fft_score(self, frames: list[np.ndarray]) -> float:
+        """
+        Pixel-wise 1D FFT over the time axis (paper §III-C / Kim et al. [7]).
+        For each pixel position in a 32×32 downsampled grid, the 1D FFT is
+        computed across T frame samples. Real video concentrates energy in the
+        DC component (slow, smooth motion). Deepfakes often exhibit elevated
+        high-frequency temporal components due to frame-level inconsistencies.
+        Returns float [0, 1] where higher = more anomalous.
+        """
+        try:
+            import cv2  # type: ignore
+            if len(frames) < 8:
+                return 0.3
+            # Sample up to 32 frames evenly
+            step = max(1, len(frames) // 32)
+            sampled = frames[::step][:32]
+            if len(sampled) < 4:
+                return 0.3
+            # Downsample each frame to 32×32 grayscale float32
+            gray_stack = np.array(
+                [
+                    cv2.resize(
+                        cv2.cvtColor(f, cv2.COLOR_RGB2GRAY)
+                        if (f.ndim == 3 and f.shape[2] >= 3)
+                        else f[:, :, 0] if f.ndim == 3 else f,
+                        (32, 32),
+                    ).astype(np.float32)
+                    for f in sampled
+                ]
+            )  # shape: (T, 32, 32)
+            # 1D real FFT along time axis
+            fft_result = np.fft.rfft(gray_stack, axis=0)  # (T//2+1, 32, 32)
+            power = np.abs(fft_result) ** 2                # power spectrum
+            dc_power = power[0]                                    # (32, 32)
+            total_power = np.sum(power, axis=0) + 1e-9            # (32, 32)
+            hf_ratio = 1.0 - (dc_power / total_power)             # per-pixel HF ratio
+            mean_hf = float(np.mean(hf_ratio))
+            # Real video: mean_hf ≈ 0.20–0.40 (most energy in slow motion).
+            # Deepfakes deviate in either direction (flickering >0.55 or
+            # unnaturally smooth <0.10). Centre of normal range = 0.30.
+            score = float(np.clip(abs(mean_hf - 0.30) / 0.25, 0.0, 1.0))
+            return score
+        except Exception as exc:
+            logger.warning("Temporal FFT score error: %s", _short_error(exc))
+            return 0.3
     def run_video(self, frames: list[np.ndarray]) -> EngineResult:
         t0 = time.perf_counter()
         self._ensure()
         sample = frames[::6] or [frames[0]]
         results = [self.run(Image.fromarray(frame)) for frame in sample]
+        cnn_geo_avg = float(np.mean([r.confidence for r in results]))
+        # Pixel-wise temporal FFT (paper §III-C / Kim et al. [7])
+        fft_score = self._temporal_fft_score(frames)
+        # Final: CNN+geometry 80%, temporal FFT 20%
+        avg = float(np.clip(cnn_geo_avg * 0.80 + fft_score * 0.20, 0.0, 1.0))
         return EngineResult(
             engine="sstgnn",
             verdict="FAKE" if avg > 0.5 else "REAL",
             confidence=avg,
             attributed_generator=None,
+            explanation=(
+                f"CNN+geometry avg {cnn_geo_avg:.2f} over {len(sample)} frames, "
+                f"temporal FFT anomaly {fft_score:.2f}."
+            ),
             processing_time_ms=(time.perf_counter() - t0) * 1000,
         )

src/fusion/fuser.py CHANGED Viewed

@@ -1,27 +1,47 @@
 from __future__ import annotations
 import numpy as np
 from src.types import DetectionResponse, EngineResult
-ENGINE_WEIGHTS = {
-    "fingerprint": 0.45,
-    "coherence": 0.35,
-    "sstgnn": 0.20,
 }
-ENGINE_WEIGHTS_VIDEO = {
-    "fingerprint": 0.30,
-    "coherence": 0.50,
-    "sstgnn": 0.20,
 }
-ATTRIBUTION_PRIORITY = {
     "fingerprint": 1,
-    "sstgnn": 2,
-    "coherence": 3,
 }
 def _normalize_generator(value: str | None) -> str:
     if not value:
@@ -29,31 +49,103 @@ def _normalize_generator(value: str | None) -> str:
     return str(value).strip().lower().replace(" ", "_")
 def fuse(results: list[EngineResult], is_video: bool = False) -> tuple[str, float, str]:
-    """Return (verdict, confidence_for_verdict, attributed_generator)."""
-    weights = ENGINE_WEIGHTS_VIDEO if is_video else ENGINE_WEIGHTS
-    active = [result for result in results if result.verdict != "UNKNOWN"]
     if not active:
-        return "UNKNOWN", 0.5, "unknown_gan"
-    wf = sum(
-        result.confidence * weights.get(result.engine, 0.1)
-        for result in active
-        if result.verdict == "FAKE"
-    )
-    wr = sum(
-        (1.0 - result.confidence) * weights.get(result.engine, 0.1)
-        for result in active
-        if result.verdict == "REAL"
-    )
-    denom = wf + wr + 1e-9
-    fake_prob = float(np.clip(wf / denom, 0.0, 1.0))
     verdict = "FAKE" if fake_prob > 0.5 else "REAL"
     confidence = fake_prob if verdict == "FAKE" else (1.0 - fake_prob)
     generator = "real"
     if verdict == "FAKE":
         for result in sorted(active, key=lambda r: ATTRIBUTION_PRIORITY.get(r.engine, 9)):
@@ -62,9 +154,9 @@ def fuse(results: list[EngineResult], is_video: bool = False) -> tuple[str, floa
                 generator = candidate
                 break
         if generator == "real":
-            generator = "unknown_gan"
-    return verdict, confidence, generator
 class Fuser:
@@ -80,7 +172,7 @@ class Fuser:
             return DetectionResponse(
                 verdict="REAL",
                 confidence=0.5,
-                attributed_generator="unknown_gan",
                 explanation="No engine results available.",
                 processing_time_ms=round(total_ms, 2),
                 engine_breakdown=[],
@@ -95,7 +187,9 @@ class Fuser:
                 f"{result.engine}:{result.verdict}({result.confidence:.2f})"
                 for result in results
             )
-            explanation = f"Fused {media_type} analysis from engines: {summary}."
         return DetectionResponse(
             verdict=verdict,

+"""
+src/fusion/fuser.py — Multi-engine evidence fusion.
+Implements Dempster-Shafer (DS) evidence theory combination of the three
+detection engine outputs (paper §III-E / Module 5).
+DS replaces the previous simple weighted average. Each engine produces a
+Basic Probability Assignment (BPA) over {FAKE, REAL, Θ} where Θ is the
+set of all hypotheses (total ignorance). DS combination normalises away
+the conflict between contradictory masses, yielding a combined BPA that
+reflects consensus while respecting uncertainty.
+The final confidence is derived via the pignistic probability transform
+(Smets), which distributes the ignorance mass equally between FAKE and REAL.
+"""
 from __future__ import annotations
 import numpy as np
 from src.types import DetectionResponse, EngineResult
+# Engine reliability weights used to build each engine's BPA.
+# Higher weight → engine commits more mass to its verdict, less to Θ.
+ENGINE_RELIABILITY: dict[str, float] = {
+    "fingerprint": 0.70,
+    "coherence":   0.65,
+    "sstgnn":      0.60,
 }
+ENGINE_RELIABILITY_VIDEO: dict[str, float] = {
+    "fingerprint": 0.55,
+    "coherence":   0.75,
+    "sstgnn":      0.65,
 }
+# Attribution priority: which engine's generator label is most trusted
+ATTRIBUTION_PRIORITY: dict[str, int] = {
     "fingerprint": 1,
+    "sstgnn":      2,
+    "coherence":   3,
 }
+# Type alias for a Basic Probability Assignment over {FAKE, REAL, Θ}
+_BPA = dict[str, float]
 def _normalize_generator(value: str | None) -> str:
     if not value:
     return str(value).strip().lower().replace(" ", "_")
def _engine_to_bpa(result: EngineResult, is_video: bool = False) -> _BPA:
    """
    Convert an EngineResult into a Basic Probability Assignment.

    The engine reliability weight (w) determines how much mass is committed
    to the engine's verdict vs. left as ignorance (Θ).

    BPA structure:
        m({FAKE}) + m({REAL}) + m(Θ) = 1.0

    Args:
        result: Engine output; ``engine``, ``verdict`` and ``confidence``
            are read.
        is_video: Select the video reliability table instead of the image one.

    Returns:
        Dict with keys ``"FAKE"``, ``"REAL"`` and ``"Θ"``. An ``UNKNOWN``
        verdict yields the vacuous BPA (all mass on Θ). Engines missing from
        the reliability table get w = 0.50.
    """
    if result.verdict == "UNKNOWN":
        # No opinion: all mass is ignorance, confidence is irrelevant.
        return {"FAKE": 0.0, "REAL": 0.0, "Θ": 1.0}

    weights = ENGINE_RELIABILITY_VIDEO if is_video else ENGINE_RELIABILITY
    w = weights.get(result.engine, 0.50)
    # Clamp so the three masses always stay non-negative and sum to 1.
    c = min(max(float(result.confidence), 0.0), 1.0)

    if result.verdict == "FAKE":
        return {
            "FAKE": c * w,
            "REAL": (1.0 - c) * w,
            "Θ":    1.0 - w,
        }

    # verdict == "REAL"
    # Engines are not uniform about what `confidence` means here: e.g.
    # SSTGNNEngine.run_video reports the fake-probability (<= 0.5) alongside
    # a REAL verdict. Using that number directly as support for REAL would
    # hand most of the committed mass to FAKE. Folding around 0.5 makes the
    # verdict decide the direction and the distance from 0.5 the strength,
    # which is right for both conventions.
    real_support = max(c, 1.0 - c)
    return {
        "REAL": real_support * w,
        "FAKE": (1.0 - real_support) * w,
        "Θ":    1.0 - w,
    }
def _ds_combine(m1: _BPA, m2: _BPA) -> _BPA:
    """
    Fuse two BPAs over {FAKE, REAL, Θ} with Dempster's rule of combination.

    Every pair of focal elements contributes the product of its masses to
    their intersection. Pairs that intersect in ∅ (FAKE with REAL) make up
    the conflict K, and the surviving masses are rescaled by 1 / (1 - K).

    Returns a new dict with keys "FAKE", "REAL" and "Θ". When the inputs are
    in (near-)total conflict, i.e. 1 - K < 1e-9, the result is an even
    FAKE/REAL split with no ignorance mass.
    """
    f1, r1, t1 = m1["FAKE"], m1["REAL"], m1["Θ"]
    f2, r2, t2 = m2["FAKE"], m2["REAL"], m2["Θ"]

    # FAKE ∩ REAL = ∅ in either order: this mass is discarded as conflict.
    conflict = f1 * r2 + r1 * f2

    # Products whose intersection is non-empty (Θ intersects everything).
    joint = {
        "FAKE": f1 * f2 + f1 * t2 + t1 * f2,
        "REAL": r1 * r2 + r1 * t2 + t1 * r2,
        "Θ": t1 * t2,
    }

    surviving = 1.0 - conflict
    if surviving < 1e-9:
        # Total conflict → maximum uncertainty
        return {"FAKE": 0.5, "REAL": 0.5, "Θ": 0.0}

    return {label: mass / surviving for label, mass in joint.items()}
 def fuse(results: list[EngineResult], is_video: bool = False) -> tuple[str, float, str]:
+    """
+    Dempster-Shafer fusion of engine results.
+    Returns (verdict, confidence_for_verdict, attributed_generator).
+    Confidence is derived via the pignistic probability transform (Smets 1990):
+    ignorance mass Θ is split equally between FAKE and REAL before thresholding.
+    This avoids overconfident verdicts when engines disagree.
+    """
+    active = [r for r in results if r.verdict != "UNKNOWN"]
     if not active:
+        return "UNKNOWN", 0.5, "unknown_generative"
+    # Build and combine BPAs iteratively
+    bpas = [_engine_to_bpa(r, is_video) for r in active]
+    combined = bpas[0]
+    for bpa in bpas[1:]:
+        combined = _ds_combine(combined, bpa)
+    # Pignistic transform: distribute Θ mass equally
+    theta = combined.get("Θ", 0.0)
+    pign_fake = combined["FAKE"] + theta / 2.0
+    pign_real = combined["REAL"] + theta / 2.0
+    pign_total = pign_fake + pign_real + 1e-9
+    fake_prob = float(np.clip(pign_fake / pign_total, 0.0, 1.0))
     verdict = "FAKE" if fake_prob > 0.5 else "REAL"
     confidence = fake_prob if verdict == "FAKE" else (1.0 - fake_prob)
+    # Generator attribution: highest-priority engine with a non-real label
     generator = "real"
     if verdict == "FAKE":
         for result in sorted(active, key=lambda r: ATTRIBUTION_PRIORITY.get(r.engine, 9)):
                 generator = candidate
                 break
         if generator == "real":
+            generator = "unknown_generative"
+    return verdict, float(np.clip(confidence, 0.0, 1.0)), generator
 class Fuser:
             return DetectionResponse(
                 verdict="REAL",
                 confidence=0.5,
+                attributed_generator="unknown_generative",
                 explanation="No engine results available.",
                 processing_time_ms=round(total_ms, 2),
                 engine_breakdown=[],
                 f"{result.engine}:{result.verdict}({result.confidence:.2f})"
                 for result in results
             )
+            explanation = (
+                f"Dempster-Shafer fusion ({media_type}) from engines: {summary}."
+            )
         return DetectionResponse(
             verdict=verdict,

src/types.py CHANGED Viewed

@@ -12,27 +12,29 @@ from pydantic import BaseModel, field_validator
 class GeneratorLabel(str, Enum):
-    """Generator attribution labels used across the pipeline."""
     real = "real"
-    unknown_gan = "unknown_gan"
     stable_diffusion = "stable_diffusion"
     midjourney = "midjourney"
     dall_e = "dall_e"
-    flux = "flux"
-    firefly = "firefly"
-    imagen = "imagen"
 GENERATOR_INDEX_TO_LABEL: dict[int, GeneratorLabel] = {
     0: GeneratorLabel.real,
-    1: GeneratorLabel.unknown_gan,
-    2: GeneratorLabel.stable_diffusion,
-    3: GeneratorLabel.midjourney,
-    4: GeneratorLabel.dall_e,
-    5: GeneratorLabel.flux,
-    6: GeneratorLabel.firefly,
-    7: GeneratorLabel.imagen,
 }
@@ -46,6 +48,10 @@ class EngineResult(BaseModel):
     explanation: str = ""
     processing_time_ms: float = 0.0
     @field_validator("confidence")
     @classmethod
     def confidence_in_range(cls, value: float) -> float:
@@ -71,6 +77,13 @@ class DetectionResponse(BaseModel):
     processing_time_ms: float
     engine_breakdown: list[EngineResult]
     # Optional explainability metadata
     clarity_score: Optional[float] = None
     saliency_map_url: Optional[str] = None

class GeneratorLabel(str, Enum):
    """Generator attribution labels — aligned to paper's 8-generator taxonomy.

    The ``str`` mixin makes each member compare equal to its plain string
    value, and every value is spelled exactly like its member name. The set
    is ``real`` plus eight generator labels.
    """
    # Label for media not attributed to any generator.
    real = "real"
    sora = "sora"
    runway = "runway"
    wav2lip = "wav2lip"
    stable_diffusion = "stable_diffusion"
    sdxl = "sdxl"
    midjourney = "midjourney"
    dall_e = "dall_e"
    # Catch-all for generated media whose specific generator is not named.
    unknown_generative = "unknown_generative"
 GENERATOR_INDEX_TO_LABEL: dict[int, GeneratorLabel] = {
     0: GeneratorLabel.real,
+    1: GeneratorLabel.sora,
+    2: GeneratorLabel.runway,
+    3: GeneratorLabel.wav2lip,
+    4: GeneratorLabel.stable_diffusion,
+    5: GeneratorLabel.sdxl,
+    6: GeneratorLabel.midjourney,
+    7: GeneratorLabel.dall_e,
+    8: GeneratorLabel.unknown_generative,
 }
     explanation: str = ""
     processing_time_ms: float = 0.0
+    # Audio coherence sub-scores (populated by CoherenceEngine on video input)
+    audio_sync_score: Optional[float] = None
+    timestamp_markers: list[dict] = []
     @field_validator("confidence")
     @classmethod
     def confidence_in_range(cls, value: float) -> float:
     processing_time_ms: float
     engine_breakdown: list[EngineResult]
+    # Module 4 — Continual Learning novelty signal
+    novelty_score: Optional[float] = None
+    # Module 1 — Audio lip-sync coherence sub-scores
+    audio_sync_score: Optional[float] = None
+    timestamp_markers: list[dict] = []
     # Optional explainability metadata
     clarity_score: Optional[float] = None
     saliency_map_url: Optional[str] = None