Spaces:
Paused
Paused
feat: Enhance generator detection and attribution mechanisms
Browse files
- Introduced NoveltyDetector for detecting unseen generators using a CLIP embedding ring buffer and IsolationForest.
- Added GeneratorRegistry for monitoring generator performance and retention, flagging those below a defined threshold.
- Updated FingerprintEngine to include DCT frequency analysis and improved generator attribution logic.
- Enhanced CoherenceEngine with audio lip-sync analysis, integrating it into the video processing pipeline.
- Implemented Dempster-Shafer evidence fusion in Fuser for more robust verdicts based on multiple engine outputs.
- Revised generator labels and their corresponding prompts to align with the updated taxonomy.
- Added support for audio coherence sub-scores and timestamp markers in detection responses.
- src/api/main.py +14 -7
- src/continual/novelty_detector.py +131 -0
- src/continual/registry.py +114 -0
- src/engines/coherence/engine.py +173 -7
- src/engines/fingerprint/engine.py +82 -15
- src/engines/sstgnn/engine.py +66 -2
- src/fusion/fuser.py +126 -32
- src/types.py +25 -12
src/api/main.py
CHANGED
|
@@ -17,6 +17,8 @@ from fastapi.middleware.cors import CORSMiddleware
|
|
| 17 |
from fastapi.responses import HTMLResponse
|
| 18 |
from PIL import ExifTags, Image
|
| 19 |
|
|
|
|
|
|
|
| 20 |
from src.engines.coherence.engine import CoherenceEngine
|
| 21 |
from src.engines.fingerprint.engine import FingerprintEngine
|
| 22 |
from src.engines.sstgnn.engine import SSTGNNEngine
|
|
@@ -63,6 +65,10 @@ _co = CoherenceEngine()
|
|
| 63 |
_st = SSTGNNEngine()
|
| 64 |
_hf = HFInferenceClient()
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
MAX_IMAGE_MB = int(os.environ.get("MAX_IMAGE_SIZE_MB", 20))
|
| 67 |
MAX_VIDEO_MB = int(os.environ.get("MAX_VIDEO_SIZE_MB", 100))
|
| 68 |
MAX_FRAMES = int(os.environ.get("MAX_VIDEO_FRAMES", 300))
|
|
@@ -72,13 +78,14 @@ VIDEO_TYPES = {"video/mp4", "video/quicktime", "video/x-msvideo", "video/webm",
|
|
| 72 |
|
| 73 |
SUPPORTED_GENERATORS = [
|
| 74 |
"real",
|
| 75 |
-
"
|
|
|
|
|
|
|
| 76 |
"stable_diffusion",
|
|
|
|
| 77 |
"midjourney",
|
| 78 |
"dall_e",
|
| 79 |
-
"
|
| 80 |
-
"firefly",
|
| 81 |
-
"imagen",
|
| 82 |
]
|
| 83 |
|
| 84 |
SYNTHETIC_KEYWORDS = (
|
|
@@ -174,7 +181,7 @@ def _apply_metadata_keyword_signal(
|
|
| 174 |
engine="metadata_signal",
|
| 175 |
verdict="FAKE",
|
| 176 |
confidence=0.98,
|
| 177 |
-
attributed_generator="
|
| 178 |
explanation=f"Filename/metadata contains synthetic keyword(s): {', '.join(hits)}.",
|
| 179 |
processing_time_ms=0.0,
|
| 180 |
)
|
|
@@ -189,7 +196,7 @@ def _apply_metadata_keyword_signal(
|
|
| 189 |
flagged.verdict = "FAKE"
|
| 190 |
flagged.confidence = max(flagged.confidence, 0.85)
|
| 191 |
if flagged.attributed_generator == "real":
|
| 192 |
-
flagged.attributed_generator = "
|
| 193 |
|
| 194 |
return flagged
|
| 195 |
|
|
@@ -342,7 +349,7 @@ def _hf_generator_label(preds: list[dict], verdict: str) -> str:
|
|
| 342 |
continue
|
| 343 |
if candidate.replace("_", " ") in labels or candidate in labels:
|
| 344 |
return candidate
|
| 345 |
-
return "
|
| 346 |
|
| 347 |
|
| 348 |
def _build_hf_response(preds: list[dict], elapsed_ms: float, media_type: str) -> DetectionResponse:
|
|
|
|
| 17 |
from fastapi.responses import HTMLResponse
|
| 18 |
from PIL import ExifTags, Image
|
| 19 |
|
| 20 |
+
from src.continual.novelty_detector import NoveltyDetector
|
| 21 |
+
from src.continual.registry import GeneratorRegistry
|
| 22 |
from src.engines.coherence.engine import CoherenceEngine
|
| 23 |
from src.engines.fingerprint.engine import FingerprintEngine
|
| 24 |
from src.engines.sstgnn.engine import SSTGNNEngine
|
|
|
|
| 65 |
_st = SSTGNNEngine()
|
| 66 |
_hf = HFInferenceClient()
|
| 67 |
|
| 68 |
+
# Module 4 — Continual Learning backbone (paper §III-D)
|
| 69 |
+
_novelty_detector = NoveltyDetector(buffer_size=500, min_fit_size=50, refit_interval=25)
|
| 70 |
+
_generator_registry = GeneratorRegistry()
|
| 71 |
+
|
| 72 |
MAX_IMAGE_MB = int(os.environ.get("MAX_IMAGE_SIZE_MB", 20))
|
| 73 |
MAX_VIDEO_MB = int(os.environ.get("MAX_VIDEO_SIZE_MB", 100))
|
| 74 |
MAX_FRAMES = int(os.environ.get("MAX_VIDEO_FRAMES", 300))
|
|
|
|
| 78 |
|
| 79 |
SUPPORTED_GENERATORS = [
|
| 80 |
"real",
|
| 81 |
+
"sora",
|
| 82 |
+
"runway",
|
| 83 |
+
"wav2lip",
|
| 84 |
"stable_diffusion",
|
| 85 |
+
"sdxl",
|
| 86 |
"midjourney",
|
| 87 |
"dall_e",
|
| 88 |
+
"unknown_generative",
|
|
|
|
|
|
|
| 89 |
]
|
| 90 |
|
| 91 |
SYNTHETIC_KEYWORDS = (
|
|
|
|
| 181 |
engine="metadata_signal",
|
| 182 |
verdict="FAKE",
|
| 183 |
confidence=0.98,
|
| 184 |
+
attributed_generator="unknown_generative",
|
| 185 |
explanation=f"Filename/metadata contains synthetic keyword(s): {', '.join(hits)}.",
|
| 186 |
processing_time_ms=0.0,
|
| 187 |
)
|
|
|
|
| 196 |
flagged.verdict = "FAKE"
|
| 197 |
flagged.confidence = max(flagged.confidence, 0.85)
|
| 198 |
if flagged.attributed_generator == "real":
|
| 199 |
+
flagged.attributed_generator = "unknown_generative"
|
| 200 |
|
| 201 |
return flagged
|
| 202 |
|
|
|
|
| 349 |
continue
|
| 350 |
if candidate.replace("_", " ") in labels or candidate in labels:
|
| 351 |
return candidate
|
| 352 |
+
return "unknown_generative"
|
| 353 |
|
| 354 |
|
| 355 |
def _build_hf_response(preds: list[dict], elapsed_ms: float, media_type: str) -> DetectionResponse:
|
src/continual/novelty_detector.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
src/continual/novelty_detector.py — Novel-generator detection via CLIP ring buffer.
|
| 3 |
+
|
| 4 |
+
Implements Epic 4 of the paper: a CLIP embedding ring buffer with an IsolationForest
|
| 5 |
+
that detects when an input resembles a generator not seen during training.
|
| 6 |
+
|
| 7 |
+
Architecture (paper Fig. 1, Epic 4):
|
| 8 |
+
CLIP embedding ring buffer → IsolationForest → novelty_score [0, 1]
|
| 9 |
+
|
| 10 |
+
A high novelty_score indicates the input may come from a generator not yet
|
| 11 |
+
indexed by the fingerprint module — this is the anti-Detector-Rot signal.
|
| 12 |
+
"""
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import logging
|
| 16 |
+
import threading
|
| 17 |
+
from collections import deque
|
| 18 |
+
from typing import Optional
|
| 19 |
+
|
| 20 |
+
import numpy as np
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)


class NoveltyDetector:
    """
    CLIP embedding ring buffer + IsolationForest novelty detector.

    As soon as `min_fit_size` embeddings have accumulated, an IsolationForest
    is fitted on the buffer; it is then refitted every `refit_interval`
    updates. Every subsequent embedding receives a novelty score in [0, 1].
    0.5 is returned until the forest is ready.

    Thread-safe: all public methods and properties acquire the internal lock.

    Parameters
    ----------
    buffer_size:    Maximum embeddings to retain (FIFO eviction).
    min_fit_size:   Minimum buffer size before the first forest fit.
    refit_interval: How many updates between successive refits. Values
                    below 1 are treated as 1 (refit on every update).
    contamination:  Expected outlier fraction (passed to IsolationForest).
    """

    def __init__(
        self,
        buffer_size: int = 500,
        min_fit_size: int = 50,
        refit_interval: int = 25,
        contamination: float = 0.1,
    ) -> None:
        self._buffer: deque[np.ndarray] = deque(maxlen=buffer_size)
        self._forest = None
        self._lock = threading.Lock()
        self._min_fit_size = min_fit_size
        # Clamp so the modulo in update() can never divide by zero.
        self._refit_interval = max(1, refit_interval)
        self._contamination = contamination
        self._n_updates = 0
        # True once the first fit has been attempted; a failed first attempt
        # is retried on the regular refit schedule rather than on every call.
        self._fit_attempted = False

    # ------------------------------------------------------------------
    # Public
    # ------------------------------------------------------------------

    def update(self, clip_embedding: np.ndarray) -> float:
        """
        Add `clip_embedding` to the ring buffer and return a novelty score.

        Returns 0.5 until the forest has been fitted, when scoring fails, or
        when the embedding is empty or contains NaN/inf (such embeddings are
        not buffered, because they would make every later refit fail until
        they are evicted).

        Parameters
        ----------
        clip_embedding: 1-D (or any shape, will be flattened) array-like
                        from CLIP's image encoder; converted to float32.

        Returns
        -------
        novelty_score: float in [0, 1]. Higher = more novel (unseen generator).
        """
        with self._lock:
            # np.array copies, so later mutation by the caller cannot alter
            # what is stored in the buffer.
            emb = np.array(clip_embedding, dtype=np.float32).ravel()
            if emb.size == 0 or not np.all(np.isfinite(emb)):
                logger.warning("NoveltyDetector: ignoring empty or non-finite embedding")
                return 0.5

            self._buffer.append(emb)
            self._n_updates += 1
            buffered = len(self._buffer)

            if buffered >= self._min_fit_size:
                periodic_due = self._n_updates % self._refit_interval == 0
                # Fit immediately the first time the threshold is reached
                # instead of waiting for the next multiple of refit_interval.
                if periodic_due or not self._fit_attempted:
                    self._fit_attempted = True
                    self._refit()

            if self._forest is None or buffered < self._min_fit_size:
                return 0.5

            try:
                # score_samples: more negative = more anomalous
                raw = float(self._forest.score_samples(emb.reshape(1, -1))[0])
            except Exception as exc:
                logger.warning("IsolationForest scoring error: %s", exc)
                return 0.5

            # Linear map: raw = -0.1 -> 0.0 and raw = -0.5 -> 1.0, clipped.
            return float(np.clip((-raw - 0.1) / 0.4, 0.0, 1.0))

    @property
    def buffer_size(self) -> int:
        """Current number of embeddings in the ring buffer."""
        with self._lock:
            return len(self._buffer)

    @property
    def is_ready(self) -> bool:
        """True once the IsolationForest has been fitted at least once."""
        with self._lock:
            return self._forest is not None and len(self._buffer) >= self._min_fit_size

    # ------------------------------------------------------------------
    # Private
    # ------------------------------------------------------------------

    def _refit(self) -> None:
        """
        Fit a fresh IsolationForest on all buffered embeddings.

        Must be called with the internal lock held. On any failure (including
        scikit-learn being unavailable) the previous forest, if any, is kept
        and a warning is logged.
        """
        try:
            # Imported lazily so that a missing scikit-learn only disables
            # novelty scoring instead of failing at module import.
            from sklearn.ensemble import IsolationForest  # type: ignore

            X = np.array(list(self._buffer), dtype=np.float32)
            forest = IsolationForest(
                contamination=self._contamination,
                random_state=42,
                n_estimators=50,  # lightweight — no GPU required
            )
            forest.fit(X)
            self._forest = forest
            logger.debug(
                "NoveltyDetector: refitted IsolationForest on %d embeddings",
                len(self._buffer),
            )
        except Exception as exc:
            logger.warning("NoveltyDetector refit failed: %s", exc)
|
src/continual/registry.py
CHANGED
|
@@ -22,6 +22,7 @@ from __future__ import annotations
|
|
| 22 |
import json
|
| 23 |
import logging
|
| 24 |
import os
|
|
|
|
| 25 |
from datetime import datetime, timezone
|
| 26 |
from pathlib import Path
|
| 27 |
from typing import Any
|
|
@@ -119,3 +120,116 @@ class TaskRegistry:
|
|
| 119 |
json.dumps(tasks, indent=2, default=str),
|
| 120 |
encoding="utf-8",
|
| 121 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
import json
|
| 23 |
import logging
|
| 24 |
import os
|
| 25 |
+
import threading
|
| 26 |
from datetime import datetime, timezone
|
| 27 |
from pathlib import Path
|
| 28 |
from typing import Any
|
|
|
|
| 120 |
json.dumps(tasks, indent=2, default=str),
|
| 121 |
encoding="utf-8",
|
| 122 |
)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# ---------------------------------------------------------------------------
|
| 126 |
+
# GeneratorRegistry — retention-aware live monitoring registry
|
| 127 |
+
# ---------------------------------------------------------------------------
|
| 128 |
+
|
| 129 |
+
_DEFAULT_RETENTION_PATH = Path(
|
| 130 |
+
os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
|
| 131 |
+
) / "generator_registry.json"
|
| 132 |
+
|
| 133 |
+
RETENTION_THRESHOLD = 0.85 # paper §III-D: flag if retention drops below 85%
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
class GeneratorRegistry:
|
| 137 |
+
"""
|
| 138 |
+
Live monitoring registry for per-generator knowledge retention.
|
| 139 |
+
|
| 140 |
+
Tracks how well the system performs on each known generator class and
|
| 141 |
+
flags generators whose accuracy drops below the 85% retention threshold
|
| 142 |
+
(paper §III-D). Designed to be called by APScheduler every N hours.
|
| 143 |
+
|
| 144 |
+
The registry is persisted to a JSON file at MODEL_CACHE_DIR/generator_registry.json
|
| 145 |
+
and survives container restarts on HF Spaces' /data volume.
|
| 146 |
+
|
| 147 |
+
Parameters
|
| 148 |
+
----------
|
| 149 |
+
path:
|
| 150 |
+
Path to the JSON persistence file. Defaults to MODEL_CACHE_DIR/generator_registry.json.
|
| 151 |
+
"""
|
| 152 |
+
|
| 153 |
+
def __init__(self, path: Path | str | None = None) -> None:
|
| 154 |
+
self._path = Path(path) if path else _DEFAULT_RETENTION_PATH
|
| 155 |
+
self._lock = threading.Lock()
|
| 156 |
+
|
| 157 |
+
# ------------------------------------------------------------------
|
| 158 |
+
# Retention tracking
|
| 159 |
+
# ------------------------------------------------------------------
|
| 160 |
+
|
| 161 |
+
def record_prediction(self, generator_label: str, correct: bool) -> None:
|
| 162 |
+
"""
|
| 163 |
+
Record whether a prediction for `generator_label` was correct.
|
| 164 |
+
|
| 165 |
+
Updates a running accuracy estimate using an exponential moving average.
|
| 166 |
+
"""
|
| 167 |
+
with self._lock:
|
| 168 |
+
data = self._load()
|
| 169 |
+
entry = data.setdefault(generator_label, {"ema_accuracy": 1.0, "n_samples": 0, "flagged": False})
|
| 170 |
+
n = entry["n_samples"]
|
| 171 |
+
alpha = min(0.1, 2.0 / (n + 2)) # EMA decay; stabilises after ~20 samples
|
| 172 |
+
entry["ema_accuracy"] = (1 - alpha) * entry["ema_accuracy"] + alpha * (1.0 if correct else 0.0)
|
| 173 |
+
entry["n_samples"] += 1
|
| 174 |
+
entry["flagged"] = entry["ema_accuracy"] < RETENTION_THRESHOLD
|
| 175 |
+
self._save(data)
|
| 176 |
+
|
| 177 |
+
def retention_scores(self) -> dict[str, float]:
|
| 178 |
+
"""Return {generator_label: ema_accuracy} for all tracked generators."""
|
| 179 |
+
return {k: v["ema_accuracy"] for k, v in self._load().items()}
|
| 180 |
+
|
| 181 |
+
def flagged_generators(self) -> list[str]:
|
| 182 |
+
"""Return labels whose retention dropped below the 85% threshold."""
|
| 183 |
+
return [k for k, v in self._load().items() if v.get("flagged")]
|
| 184 |
+
|
| 185 |
+
def check_retention(self) -> None:
|
| 186 |
+
"""
|
| 187 |
+
APScheduler job: log retention status and warn on degraded generators.
|
| 188 |
+
|
| 189 |
+
Called automatically on a schedule (e.g., every 6 hours).
|
| 190 |
+
Any generator below RETENTION_THRESHOLD is logged as a warning so that
|
| 191 |
+
operators can trigger a manual review cycle.
|
| 192 |
+
"""
|
| 193 |
+
flagged = self.flagged_generators()
|
| 194 |
+
scores = self.retention_scores()
|
| 195 |
+
|
| 196 |
+
if not scores:
|
| 197 |
+
logger.info("GeneratorRegistry: no retention data recorded yet.")
|
| 198 |
+
return
|
| 199 |
+
|
| 200 |
+
logger.info(
|
| 201 |
+
"GeneratorRegistry retention check — %d generators tracked, %d flagged.",
|
| 202 |
+
len(scores),
|
| 203 |
+
len(flagged),
|
| 204 |
+
)
|
| 205 |
+
for label, acc in sorted(scores.items()):
|
| 206 |
+
level = logging.WARNING if acc < RETENTION_THRESHOLD else logging.DEBUG
|
| 207 |
+
logger.log(level, " %s: EMA accuracy = %.1f%%", label, acc * 100)
|
| 208 |
+
|
| 209 |
+
if flagged:
|
| 210 |
+
logger.warning(
|
| 211 |
+
"Generators below %.0f%% retention threshold: %s. "
|
| 212 |
+
"Consider triggering an incremental update cycle.",
|
| 213 |
+
RETENTION_THRESHOLD * 100,
|
| 214 |
+
", ".join(flagged),
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
# ------------------------------------------------------------------
|
| 218 |
+
# Internal
|
| 219 |
+
# ------------------------------------------------------------------
|
| 220 |
+
|
| 221 |
+
def _load(self) -> dict[str, Any]:
|
| 222 |
+
if not self._path.exists():
|
| 223 |
+
return {}
|
| 224 |
+
try:
|
| 225 |
+
return json.loads(self._path.read_text(encoding="utf-8"))
|
| 226 |
+
except json.JSONDecodeError:
|
| 227 |
+
logger.warning("GeneratorRegistry file corrupt; starting fresh.")
|
| 228 |
+
return {}
|
| 229 |
+
|
| 230 |
+
def _save(self, data: dict[str, Any]) -> None:
|
| 231 |
+
self._path.parent.mkdir(parents=True, exist_ok=True)
|
| 232 |
+
self._path.write_text(
|
| 233 |
+
json.dumps(data, indent=2, default=str),
|
| 234 |
+
encoding="utf-8",
|
| 235 |
+
)
|
src/engines/coherence/engine.py
CHANGED
|
@@ -2,10 +2,13 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
import logging
|
| 4 |
import os
|
|
|
|
|
|
|
| 5 |
import threading
|
| 6 |
import time
|
| 7 |
import urllib.request
|
| 8 |
from pathlib import Path
|
|
|
|
| 9 |
|
| 10 |
import numpy as np
|
| 11 |
from PIL import Image
|
|
@@ -194,7 +197,20 @@ class CoherenceEngine:
|
|
| 194 |
logger.warning("Coherence image scoring failed: %s", exc)
|
| 195 |
return 0.35
|
| 196 |
|
| 197 |
-
def run_video(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
t0 = time.perf_counter()
|
| 199 |
self._ensure()
|
| 200 |
|
|
@@ -216,21 +232,171 @@ class CoherenceEngine:
|
|
| 216 |
delta = self._embedding_variance(frames)
|
| 217 |
jerk = self._landmark_jerk(frames)
|
| 218 |
blink = self._blink_anomaly(frames)
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
return EngineResult(
|
| 222 |
engine="coherence",
|
| 223 |
verdict="FAKE" if score > 0.5 else "REAL",
|
| 224 |
confidence=score,
|
| 225 |
attributed_generator=None,
|
| 226 |
-
explanation=
|
| 227 |
-
f"Embedding variance {delta:.2f}, "
|
| 228 |
-
f"landmark jerk {jerk:.2f}, "
|
| 229 |
-
f"blink anomaly {blink:.2f}."
|
| 230 |
-
),
|
| 231 |
processing_time_ms=(time.perf_counter() - t0) * 1000,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
)
|
| 233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
def _embedding_variance(self, frames: list[np.ndarray]) -> float:
|
| 235 |
if _mtcnn is None or _resnet is None or _torch is None:
|
| 236 |
return 0.5
|
|
|
|
| 2 |
|
| 3 |
import logging
|
| 4 |
import os
|
| 5 |
+
import subprocess
|
| 6 |
+
import tempfile
|
| 7 |
import threading
|
| 8 |
import time
|
| 9 |
import urllib.request
|
| 10 |
from pathlib import Path
|
| 11 |
+
from typing import Optional
|
| 12 |
|
| 13 |
import numpy as np
|
| 14 |
from PIL import Image
|
|
|
|
| 197 |
logger.warning("Coherence image scoring failed: %s", exc)
|
| 198 |
return 0.35
|
| 199 |
|
| 200 |
+
def run_video(
|
| 201 |
+
self,
|
| 202 |
+
frames: list[np.ndarray],
|
| 203 |
+
video_path: Optional[str] = None,
|
| 204 |
+
) -> EngineResult:
|
| 205 |
+
"""
|
| 206 |
+
Temporal coherence analysis.
|
| 207 |
+
|
| 208 |
+
Args:
|
| 209 |
+
frames: RGB frames extracted from the video.
|
| 210 |
+
video_path: Optional path to the source video file. When provided,
|
| 211 |
+
audio is extracted and MFCC lip-sync cross-correlation
|
| 212 |
+
is computed (paper Module 1 / LipFD extension).
|
| 213 |
+
"""
|
| 214 |
t0 = time.perf_counter()
|
| 215 |
self._ensure()
|
| 216 |
|
|
|
|
| 232 |
delta = self._embedding_variance(frames)
|
| 233 |
jerk = self._landmark_jerk(frames)
|
| 234 |
blink = self._blink_anomaly(frames)
|
| 235 |
+
visual_score = float(np.clip(delta * 0.45 + jerk * 0.35 + blink * 0.20, 0.0, 1.0))
|
| 236 |
+
|
| 237 |
+
# Audio lip-sync cross-correlation (LipFD-inspired, paper §III-A)
|
| 238 |
+
audio_anomaly: Optional[float] = None
|
| 239 |
+
timestamp_markers: list[dict] = []
|
| 240 |
+
if video_path is not None:
|
| 241 |
+
audio_anomaly, timestamp_markers = self._audio_lipsync_score(video_path, frames)
|
| 242 |
+
|
| 243 |
+
if audio_anomaly is not None:
|
| 244 |
+
# Weighted: visual 60%, audio 40% (paper weights for Module 1)
|
| 245 |
+
score = float(np.clip(visual_score * 0.60 + audio_anomaly * 0.40, 0.0, 1.0))
|
| 246 |
+
explanation = (
|
| 247 |
+
f"Embedding variance {delta:.2f}, landmark jerk {jerk:.2f}, "
|
| 248 |
+
f"blink anomaly {blink:.2f}. "
|
| 249 |
+
f"Audio lip-sync anomaly {audio_anomaly:.2f} "
|
| 250 |
+
f"({len(timestamp_markers)} flagged segment(s))."
|
| 251 |
+
)
|
| 252 |
+
else:
|
| 253 |
+
score = visual_score
|
| 254 |
+
explanation = (
|
| 255 |
+
f"Embedding variance {delta:.2f}, "
|
| 256 |
+
f"landmark jerk {jerk:.2f}, "
|
| 257 |
+
f"blink anomaly {blink:.2f}."
|
| 258 |
+
)
|
| 259 |
|
| 260 |
return EngineResult(
|
| 261 |
engine="coherence",
|
| 262 |
verdict="FAKE" if score > 0.5 else "REAL",
|
| 263 |
confidence=score,
|
| 264 |
attributed_generator=None,
|
| 265 |
+
explanation=explanation,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
processing_time_ms=(time.perf_counter() - t0) * 1000,
|
| 267 |
+
audio_sync_score=audio_anomaly,
|
| 268 |
+
timestamp_markers=timestamp_markers,
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
def _audio_lipsync_score(
    self,
    video_path: str,
    frames: list[np.ndarray],
) -> tuple[float, list[dict]]:
    """
    MFCC cross-correlation with lip-aperture motion curve (paper §III-A).

    Extracts mono 16 kHz audio via ffmpeg, computes an MFCC energy envelope,
    computes a per-frame lip aperture from the module-level `_face_mesh`,
    resamples the lip curve to the envelope's length, and maps the Pearson
    correlation of the two curves to an anomaly score.

    This helper is best-effort: every failure or inconclusive situation
    (missing dependency, ffmpeg missing/failing/timing out, under 0.5 s of
    audio, no face mesh, static lips, undefined correlation) returns the
    neutral result ``(0.35, [])`` instead of raising.

    Args:
        video_path: Path to the source video file handed to ffmpeg.
        frames:     Video frames passed one by one to `_face_mesh.process`.

    Returns:
        (sync_anomaly_score, timestamp_markers)
        sync_anomaly_score: 0 = perfectly in sync, 1 = totally out of sync
        timestamp_markers:  list of {start_s, end_s, correlation} dicts for
                            segments where correlation < 0.2
    """
    try:
        import librosa  # type: ignore
        from scipy.stats import pearsonr  # type: ignore
    except ImportError as exc:
        logger.warning("Audio analysis unavailable (missing dep): %s", exc)
        return 0.35, []

    # Without a face mesh no lip curve can be built, so skip the (slow)
    # audio extraction entirely; the result is the same neutral score.
    if _face_mesh is None:
        return 0.35, []

    # delete=False: ffmpeg reopens the path by name; removed in `finally`.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_tmp:
        audio_path = audio_tmp.name

    try:
        cmd = [
            "ffmpeg",
            # Global options go before the input, as the ffmpeg docs require.
            "-y", "-loglevel", "error",
            "-i", video_path,
            "-vn",  # no video output
            "-ac", "1", "-ar", "16000",
            "-f", "wav",
            audio_path,
        ]
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                stdin=subprocess.DEVNULL,  # never let ffmpeg wait on our stdin
                timeout=30,
            )
        except (OSError, subprocess.SubprocessError) as exc:
            # ffmpeg not installed, not executable, or exceeded the timeout.
            logger.warning("ffmpeg audio extract failed: %s", exc)
            return 0.35, []

        if result.returncode != 0:
            logger.debug("ffmpeg audio extract returned %d (no audio?)", result.returncode)
            return 0.35, []

        try:
            y, sr = librosa.load(audio_path, sr=16000, mono=True)
        except Exception as exc:
            logger.warning("librosa load failed: %s", exc)
            return 0.35, []
    finally:
        Path(audio_path).unlink(missing_ok=True)

    if len(y) < sr * 0.5:
        return 0.35, []  # less than 0.5 s of audio → inconclusive

    # Audio energy envelope from MFCC
    hop_length = 512
    try:
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
        audio_curve = np.mean(np.abs(mfcc), axis=0).astype(np.float32)
    except Exception as exc:
        logger.warning("MFCC computation failed: %s", exc)
        return 0.35, []

    # Lip-aperture curve from face-mesh landmarks (inner upper lip=13, lower=14)
    lip_apertures: list[float] = []
    for frame in frames:
        try:
            res = _face_mesh.process(frame)
            if res.multi_face_landmarks:
                lm = res.multi_face_landmarks[0].landmark
                h, w = frame.shape[:2]
                upper = np.array([lm[13].x * w, lm[13].y * h], dtype=np.float32)
                lower = np.array([lm[14].x * w, lm[14].y * h], dtype=np.float32)
                lip_apertures.append(float(np.linalg.norm(upper - lower)))
            else:
                lip_apertures.append(0.0)
        except Exception:
            # Best effort: a frame that cannot be processed counts as aperture 0.
            lip_apertures.append(0.0)

    if len(lip_apertures) < 4 or float(np.std(lip_apertures)) < 1e-6:
        return 0.35, []  # static lip → can't measure sync

    target_len = len(audio_curve)
    if target_len < 4:
        return 0.35, []

    # Resample lip curve to match audio_curve length
    lip_curve = np.array(lip_apertures, dtype=np.float32)
    lip_resampled = np.interp(
        np.linspace(0, len(lip_curve) - 1, target_len),
        np.arange(len(lip_curve)),
        lip_curve,
    )

    # Overall Pearson correlation
    try:
        r_overall = float(pearsonr(audio_curve, lip_resampled)[0])
    except Exception as exc:
        logger.warning("Lip-sync correlation failed: %s", exc)
        return 0.35, []
    if not np.isfinite(r_overall):
        # pearsonr yields NaN for a constant curve (e.g. digital silence).
        # An undefined correlation is inconclusive, and letting NaN through
        # would turn the engine's confidence into NaN.
        return 0.35, []

    # Map correlation → anomaly score
    # Real speech: r typically > 0.3; deepfake: often < 0.1 or negative
    sync_anomaly = float(np.clip((0.3 - r_overall) / 0.5 + 0.35, 0.0, 1.0))

    # Sliding-window timestamp markers for low-correlation segments
    hop_s = hop_length / sr  # seconds per MFCC frame
    markers: list[dict] = []
    window = max(10, target_len // 10)
    stride = max(1, window // 2)

    # "+ 1" so the last full window (start == target_len - window) is included.
    for start in range(0, target_len - window + 1, stride):
        seg_audio = audio_curve[start : start + window]
        seg_lip = lip_resampled[start : start + window]
        try:
            r_seg = float(pearsonr(seg_audio, seg_lip)[0])
        except Exception:
            continue
        if not np.isfinite(r_seg):
            continue  # constant segment → correlation undefined, no marker
        if r_seg < 0.2:
            markers.append({
                "start_s": round(start * hop_s, 2),
                "end_s": round((start + window) * hop_s, 2),
                "correlation": round(r_seg, 3),
            })

    return sync_anomaly, markers
|
| 399 |
+
|
| 400 |
def _embedding_variance(self, frames: list[np.ndarray]) -> float:
|
| 401 |
if _mtcnn is None or _resnet is None or _torch is None:
|
| 402 |
return 0.5
|
src/engines/fingerprint/engine.py
CHANGED
|
@@ -29,14 +29,15 @@ DETECTOR_CANDIDATES = [
|
|
| 29 |
]
|
| 30 |
|
| 31 |
GENERATOR_PROMPTS: dict[str, str] = {
|
| 32 |
-
"real":
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
-
"
|
|
|
|
| 40 |
}
|
| 41 |
|
| 42 |
FAKE_LABEL_KEYWORDS = (
|
|
@@ -68,6 +69,10 @@ _clip_model: Optional[CLIPModel] = None
|
|
| 68 |
_clip_processor: Optional[CLIPProcessor] = None
|
| 69 |
_loaded = False
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
def _get_pipeline():
|
| 73 |
try:
|
|
@@ -195,7 +200,12 @@ class FingerprintEngine:
|
|
| 195 |
except Exception as exc:
|
| 196 |
logger.warning("Detector %s inference error: %s", model_id, _short_error(exc))
|
| 197 |
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
generator = self._attribute_generator(image, fake_score)
|
| 200 |
|
| 201 |
return EngineResult(
|
|
@@ -204,7 +214,7 @@ class FingerprintEngine:
|
|
| 204 |
confidence=float(fake_score),
|
| 205 |
attributed_generator=generator,
|
| 206 |
explanation=(
|
| 207 |
-
f"Ensemble
|
| 208 |
f"Generator attributed to: {generator}."
|
| 209 |
),
|
| 210 |
processing_time_ms=(time.perf_counter() - t0) * 1000,
|
|
@@ -212,7 +222,8 @@ class FingerprintEngine:
|
|
| 212 |
|
| 213 |
def _attribute_generator(self, image: Image.Image, fake_score: float) -> str:
|
| 214 |
if _clip_model is None or _clip_processor is None:
|
| 215 |
-
|
|
|
|
| 216 |
|
| 217 |
try:
|
| 218 |
texts = list(GENERATOR_PROMPTS.values())
|
|
@@ -225,18 +236,74 @@ class FingerprintEngine:
|
|
| 225 |
max_length=77,
|
| 226 |
)
|
| 227 |
with torch.no_grad():
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
probs = logits.softmax(dim=0).cpu().numpy()
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
if fake_score > 0.65 and generator == "real":
|
| 233 |
-
generator = "
|
| 234 |
if fake_score < 0.35 and generator != "real":
|
| 235 |
generator = "real"
|
| 236 |
return generator
|
| 237 |
except Exception as exc:
|
| 238 |
logger.warning("CLIP attribution error: %s", _short_error(exc))
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
def run_video(self, frames: list) -> EngineResult:
|
| 242 |
t0 = time.perf_counter()
|
|
|
|
| 29 |
]
|
| 30 |
|
| 31 |
GENERATOR_PROMPTS: dict[str, str] = {
|
| 32 |
+
"real": "a real photograph taken by a camera with natural lighting and film grain",
|
| 33 |
+
"sora": "a Sora text-to-video frame with temporal coherence and photorealistic lighting",
|
| 34 |
+
"runway": "a Runway Gen-2 frame with painterly dreamlike motion blur and color grading",
|
| 35 |
+
"wav2lip": "a Wav2Lip face-swap with sharp lip boundary artifacts and texture inconsistency at mouth edges",
|
| 36 |
+
"stable_diffusion": "an image generated by Stable Diffusion with painterly soft textures and dreamlike quality",
|
| 37 |
+
"sdxl": "an image generated by SDXL with high resolution detail, sharp edges and crisp textures",
|
| 38 |
+
"midjourney": "an image generated by Midjourney with cinematic dramatic lighting and extreme hyperdetail",
|
| 39 |
+
"dall_e": "an image generated by DALL-E with clean flat illustration style and smooth gradients",
|
| 40 |
+
"unknown_generative": "an AI-generated image with unidentifiable generator-specific artifacts and synthetic patterns",
|
| 41 |
}
|
| 42 |
|
| 43 |
FAKE_LABEL_KEYWORDS = (
|
|
|
|
| 69 |
_clip_processor: Optional[CLIPProcessor] = None
|
| 70 |
_loaded = False
|
| 71 |
|
| 72 |
+
# Thread-local storage: each request thread stores its last CLIP embedding here
|
| 73 |
+
# so the novelty detector can consume it without a second forward pass.
|
| 74 |
+
_thread_local = threading.local()
|
| 75 |
+
|
| 76 |
|
| 77 |
def _get_pipeline():
|
| 78 |
try:
|
|
|
|
| 200 |
except Exception as exc:
|
| 201 |
logger.warning("Detector %s inference error: %s", model_id, _short_error(exc))
|
| 202 |
|
| 203 |
+
ensemble_score = (weighted_fake / total_w) if total_w > 0 else 0.5
|
| 204 |
+
|
| 205 |
+
# DCT frequency band analysis (paper §III-B / Kim et al.)
|
| 206 |
+
dct_score = self._dct_frequency_score(image)
|
| 207 |
+
fake_score = float(np.clip(ensemble_score * 0.85 + dct_score * 0.15, 0.0, 1.0))
|
| 208 |
+
|
| 209 |
generator = self._attribute_generator(image, fake_score)
|
| 210 |
|
| 211 |
return EngineResult(
|
|
|
|
| 214 |
confidence=float(fake_score),
|
| 215 |
attributed_generator=generator,
|
| 216 |
explanation=(
|
| 217 |
+
f"Ensemble {ensemble_score:.2f} × 0.85 + DCT {dct_score:.2f} × 0.15 = {fake_score:.2f}. "
|
| 218 |
f"Generator attributed to: {generator}."
|
| 219 |
),
|
| 220 |
processing_time_ms=(time.perf_counter() - t0) * 1000,
|
|
|
|
| 222 |
|
| 223 |
def _attribute_generator(self, image: Image.Image, fake_score: float) -> str:
|
| 224 |
if _clip_model is None or _clip_processor is None:
|
| 225 |
+
_thread_local.last_clip_embedding = None
|
| 226 |
+
return "unknown_generative" if fake_score > 0.5 else "real"
|
| 227 |
|
| 228 |
try:
|
| 229 |
texts = list(GENERATOR_PROMPTS.values())
|
|
|
|
| 236 |
max_length=77,
|
| 237 |
)
|
| 238 |
with torch.no_grad():
|
| 239 |
+
outputs = _clip_model(**inputs)
|
| 240 |
+
logits = outputs.logits_per_image[0]
|
| 241 |
+
# Store image embedding for novelty detection
|
| 242 |
+
image_embeds = outputs.image_embeds.detach().cpu().numpy()[0]
|
| 243 |
+
_thread_local.last_clip_embedding = image_embeds
|
| 244 |
+
|
| 245 |
probs = logits.softmax(dim=0).cpu().numpy()
|
| 246 |
+
max_prob = float(np.max(probs))
|
| 247 |
+
|
| 248 |
+
# Low confidence attribution → unknown generator
|
| 249 |
+
if max_prob < 0.25:
|
| 250 |
+
generator = "unknown_generative"
|
| 251 |
+
else:
|
| 252 |
+
generator = list(GENERATOR_PROMPTS.keys())[int(np.argmax(probs))]
|
| 253 |
|
| 254 |
if fake_score > 0.65 and generator == "real":
|
| 255 |
+
generator = "unknown_generative"
|
| 256 |
if fake_score < 0.35 and generator != "real":
|
| 257 |
generator = "real"
|
| 258 |
return generator
|
| 259 |
except Exception as exc:
|
| 260 |
logger.warning("CLIP attribution error: %s", _short_error(exc))
|
| 261 |
+
_thread_local.last_clip_embedding = None
|
| 262 |
+
return "unknown_generative" if fake_score > 0.5 else "real"
|
| 263 |
+
|
| 264 |
+
def _dct_frequency_score(self, image: Image.Image) -> float:
|
| 265 |
+
"""
|
| 266 |
+
DCT frequency band analysis (paper §III-B).
|
| 267 |
+
High-frequency energy ratio is an anomaly signal: real photos follow
|
| 268 |
+
a predictable DCT energy roll-off; AI generators often deviate.
|
| 269 |
+
Returns float [0, 1] where higher = more anomalous.
|
| 270 |
+
"""
|
| 271 |
+
try:
|
| 272 |
+
from scipy.fft import dctn # type: ignore
|
| 273 |
+
|
| 274 |
+
gray = np.array(image.convert("L"), dtype=np.float32)
|
| 275 |
+
h, w = gray.shape
|
| 276 |
+
# Align to 8×8 block boundary (JPEG-DCT standard)
|
| 277 |
+
bh, bw = h - h % 8, w - w % 8
|
| 278 |
+
if bh < 8 or bw < 8:
|
| 279 |
+
return 0.3
|
| 280 |
+
crop = gray[:bh, :bw]
|
| 281 |
+
# Reshape into (n_blocks_h, n_blocks_w, 8, 8) then DCT each 8×8 block
|
| 282 |
+
blocks = crop.reshape(bh // 8, 8, bw // 8, 8).transpose(0, 2, 1, 3)
|
| 283 |
+
n_bh, n_bw = blocks.shape[:2]
|
| 284 |
+
|
| 285 |
+
dc_energy_total = 0.0
|
| 286 |
+
all_energy_total = 0.0
|
| 287 |
+
for bi in range(n_bh):
|
| 288 |
+
for bj in range(n_bw):
|
| 289 |
+
dct_block = dctn(blocks[bi, bj], norm="ortho")
|
| 290 |
+
dc_energy_total += float(dct_block[0, 0] ** 2)
|
| 291 |
+
all_energy_total += float(np.sum(dct_block ** 2))
|
| 292 |
+
|
| 293 |
+
if all_energy_total < 1e-9:
|
| 294 |
+
return 0.3
|
| 295 |
+
|
| 296 |
+
ac_ratio = 1.0 - (dc_energy_total / all_energy_total)
|
| 297 |
+
# Real photos: ac_ratio ≈ 0.80–0.90; AI images can deviate significantly
|
| 298 |
+
score = float(np.clip(abs(ac_ratio - 0.85) / 0.15, 0.0, 1.0))
|
| 299 |
+
return score
|
| 300 |
+
except Exception as exc:
|
| 301 |
+
logger.warning("DCT frequency score error: %s", _short_error(exc))
|
| 302 |
+
return 0.3
|
| 303 |
+
|
| 304 |
+
def get_last_clip_embedding(self) -> Optional[np.ndarray]:
    """Return the calling thread's cached CLIP image embedding, or None if nothing is stored."""
    try:
        # The slot lives on thread-local storage, so each thread only sees
        # the value it stored itself.
        return _thread_local.last_clip_embedding
    except AttributeError:
        # Nothing has been stored in this thread yet.
        return None
|
| 307 |
|
| 308 |
def run_video(self, frames: list) -> EngineResult:
|
| 309 |
t0 = time.perf_counter()
|
src/engines/sstgnn/engine.py
CHANGED
|
@@ -303,6 +303,61 @@ class SSTGNNEngine:
|
|
| 303 |
logger.warning("Geometry score error: %s", exc)
|
| 304 |
return 0.3
|
| 305 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
def run_video(self, frames: list[np.ndarray]) -> EngineResult:
|
| 307 |
t0 = time.perf_counter()
|
| 308 |
self._ensure()
|
|
@@ -319,14 +374,23 @@ class SSTGNNEngine:
|
|
| 319 |
|
| 320 |
sample = frames[::6] or [frames[0]]
|
| 321 |
results = [self.run(Image.fromarray(frame)) for frame in sample]
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
return EngineResult(
|
| 325 |
engine="sstgnn",
|
| 326 |
verdict="FAKE" if avg > 0.5 else "REAL",
|
| 327 |
confidence=avg,
|
| 328 |
attributed_generator=None,
|
| 329 |
-
explanation=
|
|
|
|
|
|
|
|
|
|
| 330 |
processing_time_ms=(time.perf_counter() - t0) * 1000,
|
| 331 |
)
|
| 332 |
|
|
|
|
| 303 |
logger.warning("Geometry score error: %s", exc)
|
| 304 |
return 0.3
|
| 305 |
|
| 306 |
+
def _temporal_fft_score(self, frames: list[np.ndarray]) -> float:
|
| 307 |
+
"""
|
| 308 |
+
Pixel-wise 1D FFT over the time axis (paper §III-C / Kim et al. [7]).
|
| 309 |
+
|
| 310 |
+
For each pixel position in a 32×32 downsampled grid, the 1D FFT is
|
| 311 |
+
computed across T frame samples. Real video concentrates energy in the
|
| 312 |
+
DC component (slow, smooth motion). Deepfakes often exhibit elevated
|
| 313 |
+
high-frequency temporal components due to frame-level inconsistencies.
|
| 314 |
+
|
| 315 |
+
Returns float [0, 1] where higher = more anomalous.
|
| 316 |
+
"""
|
| 317 |
+
try:
|
| 318 |
+
import cv2 # type: ignore
|
| 319 |
+
|
| 320 |
+
if len(frames) < 8:
|
| 321 |
+
return 0.3
|
| 322 |
+
|
| 323 |
+
# Sample up to 32 frames evenly
|
| 324 |
+
step = max(1, len(frames) // 32)
|
| 325 |
+
sampled = frames[::step][:32]
|
| 326 |
+
if len(sampled) < 4:
|
| 327 |
+
return 0.3
|
| 328 |
+
|
| 329 |
+
# Downsample each frame to 32×32 grayscale float32
|
| 330 |
+
gray_stack = np.array(
|
| 331 |
+
[
|
| 332 |
+
cv2.resize(
|
| 333 |
+
cv2.cvtColor(f, cv2.COLOR_RGB2GRAY)
|
| 334 |
+
if (f.ndim == 3 and f.shape[2] >= 3)
|
| 335 |
+
else f[:, :, 0] if f.ndim == 3 else f,
|
| 336 |
+
(32, 32),
|
| 337 |
+
).astype(np.float32)
|
| 338 |
+
for f in sampled
|
| 339 |
+
]
|
| 340 |
+
) # shape: (T, 32, 32)
|
| 341 |
+
|
| 342 |
+
# 1D real FFT along time axis
|
| 343 |
+
fft_result = np.fft.rfft(gray_stack, axis=0) # (T//2+1, 32, 32)
|
| 344 |
+
power = np.abs(fft_result) ** 2 # power spectrum
|
| 345 |
+
|
| 346 |
+
dc_power = power[0] # (32, 32)
|
| 347 |
+
total_power = np.sum(power, axis=0) + 1e-9 # (32, 32)
|
| 348 |
+
hf_ratio = 1.0 - (dc_power / total_power) # per-pixel HF ratio
|
| 349 |
+
mean_hf = float(np.mean(hf_ratio))
|
| 350 |
+
|
| 351 |
+
# Real video: mean_hf ≈ 0.20–0.40 (most energy in slow motion).
|
| 352 |
+
# Deepfakes deviate in either direction (flickering >0.55 or
|
| 353 |
+
# unnaturally smooth <0.10). Centre of normal range = 0.30.
|
| 354 |
+
score = float(np.clip(abs(mean_hf - 0.30) / 0.25, 0.0, 1.0))
|
| 355 |
+
return score
|
| 356 |
+
|
| 357 |
+
except Exception as exc:
|
| 358 |
+
logger.warning("Temporal FFT score error: %s", _short_error(exc))
|
| 359 |
+
return 0.3
|
| 360 |
+
|
| 361 |
def run_video(self, frames: list[np.ndarray]) -> EngineResult:
|
| 362 |
t0 = time.perf_counter()
|
| 363 |
self._ensure()
|
|
|
|
| 374 |
|
| 375 |
sample = frames[::6] or [frames[0]]
|
| 376 |
results = [self.run(Image.fromarray(frame)) for frame in sample]
|
| 377 |
+
cnn_geo_avg = float(np.mean([r.confidence for r in results]))
|
| 378 |
+
|
| 379 |
+
# Pixel-wise temporal FFT (paper §III-C / Kim et al. [7])
|
| 380 |
+
fft_score = self._temporal_fft_score(frames)
|
| 381 |
+
|
| 382 |
+
# Final: CNN+geometry 80%, temporal FFT 20%
|
| 383 |
+
avg = float(np.clip(cnn_geo_avg * 0.80 + fft_score * 0.20, 0.0, 1.0))
|
| 384 |
|
| 385 |
return EngineResult(
|
| 386 |
engine="sstgnn",
|
| 387 |
verdict="FAKE" if avg > 0.5 else "REAL",
|
| 388 |
confidence=avg,
|
| 389 |
attributed_generator=None,
|
| 390 |
+
explanation=(
|
| 391 |
+
f"CNN+geometry avg {cnn_geo_avg:.2f} over {len(sample)} frames, "
|
| 392 |
+
f"temporal FFT anomaly {fft_score:.2f}."
|
| 393 |
+
),
|
| 394 |
processing_time_ms=(time.perf_counter() - t0) * 1000,
|
| 395 |
)
|
| 396 |
|
src/fusion/fuser.py
CHANGED
|
@@ -1,27 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
|
| 5 |
from src.types import DetectionResponse, EngineResult
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
"
|
|
|
|
|
|
|
| 11 |
}
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
"
|
| 15 |
-
"
|
| 16 |
-
"sstgnn": 0.20,
|
| 17 |
}
|
| 18 |
|
| 19 |
-
|
|
|
|
| 20 |
"fingerprint": 1,
|
| 21 |
-
"sstgnn":
|
| 22 |
-
"coherence":
|
| 23 |
}
|
| 24 |
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def _normalize_generator(value: str | None) -> str:
|
| 27 |
if not value:
|
|
@@ -29,31 +49,103 @@ def _normalize_generator(value: str | None) -> str:
|
|
| 29 |
return str(value).strip().lower().replace(" ", "_")
|
| 30 |
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def fuse(results: list[EngineResult], is_video: bool = False) -> tuple[str, float, str]:
|
| 33 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
if not active:
|
| 39 |
-
return "UNKNOWN", 0.5, "
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
wr = sum(
|
| 47 |
-
(1.0 - result.confidence) * weights.get(result.engine, 0.1)
|
| 48 |
-
for result in active
|
| 49 |
-
if result.verdict == "REAL"
|
| 50 |
-
)
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
verdict = "FAKE" if fake_prob > 0.5 else "REAL"
|
| 55 |
confidence = fake_prob if verdict == "FAKE" else (1.0 - fake_prob)
|
| 56 |
|
|
|
|
| 57 |
generator = "real"
|
| 58 |
if verdict == "FAKE":
|
| 59 |
for result in sorted(active, key=lambda r: ATTRIBUTION_PRIORITY.get(r.engine, 9)):
|
|
@@ -62,9 +154,9 @@ def fuse(results: list[EngineResult], is_video: bool = False) -> tuple[str, floa
|
|
| 62 |
generator = candidate
|
| 63 |
break
|
| 64 |
if generator == "real":
|
| 65 |
-
generator = "
|
| 66 |
|
| 67 |
-
return verdict, confidence, generator
|
| 68 |
|
| 69 |
|
| 70 |
class Fuser:
|
|
@@ -80,7 +172,7 @@ class Fuser:
|
|
| 80 |
return DetectionResponse(
|
| 81 |
verdict="REAL",
|
| 82 |
confidence=0.5,
|
| 83 |
-
attributed_generator="
|
| 84 |
explanation="No engine results available.",
|
| 85 |
processing_time_ms=round(total_ms, 2),
|
| 86 |
engine_breakdown=[],
|
|
@@ -95,7 +187,9 @@ class Fuser:
|
|
| 95 |
f"{result.engine}:{result.verdict}({result.confidence:.2f})"
|
| 96 |
for result in results
|
| 97 |
)
|
| 98 |
-
explanation =
|
|
|
|
|
|
|
| 99 |
|
| 100 |
return DetectionResponse(
|
| 101 |
verdict=verdict,
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
src/fusion/fuser.py — Multi-engine evidence fusion.
|
| 3 |
+
|
| 4 |
+
Implements Dempster-Shafer (DS) evidence theory combination of the three
|
| 5 |
+
detection engine outputs (paper §III-E / Module 5).
|
| 6 |
+
|
| 7 |
+
DS replaces the previous simple weighted average. Each engine produces a
|
| 8 |
+
Basic Probability Assignment (BPA) over {FAKE, REAL, Θ} where Θ is the
|
| 9 |
+
set of all hypotheses (total ignorance). DS combination normalises away
|
| 10 |
+
the conflict between contradictory masses, yielding a combined BPA that
|
| 11 |
+
reflects consensus while respecting uncertainty.
|
| 12 |
+
|
| 13 |
+
The final confidence is derived via the pignistic probability transform
|
| 14 |
+
(Smets), which distributes the ignorance mass equally between FAKE and REAL.
|
| 15 |
+
"""
|
| 16 |
from __future__ import annotations
|
| 17 |
|
| 18 |
import numpy as np
|
| 19 |
|
| 20 |
from src.types import DetectionResponse, EngineResult
|
| 21 |
|
| 22 |
+
# Engine reliability weights used to build each engine's BPA.
|
| 23 |
+
# Higher weight → engine commits more mass to its verdict, less to Θ.
|
| 24 |
+
ENGINE_RELIABILITY: dict[str, float] = {
|
| 25 |
+
"fingerprint": 0.70,
|
| 26 |
+
"coherence": 0.65,
|
| 27 |
+
"sstgnn": 0.60,
|
| 28 |
}
|
| 29 |
+
ENGINE_RELIABILITY_VIDEO: dict[str, float] = {
|
| 30 |
+
"fingerprint": 0.55,
|
| 31 |
+
"coherence": 0.75,
|
| 32 |
+
"sstgnn": 0.65,
|
|
|
|
| 33 |
}
|
| 34 |
|
| 35 |
+
# Attribution priority: which engine's generator label is most trusted
|
| 36 |
+
ATTRIBUTION_PRIORITY: dict[str, int] = {
|
| 37 |
"fingerprint": 1,
|
| 38 |
+
"sstgnn": 2,
|
| 39 |
+
"coherence": 3,
|
| 40 |
}
|
| 41 |
|
| 42 |
+
# Type alias for a Basic Probability Assignment over {FAKE, REAL, Θ}
|
| 43 |
+
_BPA = dict[str, float]
|
| 44 |
+
|
| 45 |
|
| 46 |
def _normalize_generator(value: str | None) -> str:
|
| 47 |
if not value:
|
|
|
|
| 49 |
return str(value).strip().lower().replace(" ", "_")
|
| 50 |
|
| 51 |
|
| 52 |
+
def _engine_to_bpa(result: EngineResult, is_video: bool = False) -> _BPA:
|
| 53 |
+
"""
|
| 54 |
+
Convert an EngineResult into a Basic Probability Assignment.
|
| 55 |
+
|
| 56 |
+
The engine reliability weight (w) determines how much mass is committed
|
| 57 |
+
to the engine's verdict vs. left as ignorance (Θ).
|
| 58 |
+
|
| 59 |
+
BPA structure:
|
| 60 |
+
m({FAKE}) + m({REAL}) + m(Θ) = 1.0
|
| 61 |
+
"""
|
| 62 |
+
weights = ENGINE_RELIABILITY_VIDEO if is_video else ENGINE_RELIABILITY
|
| 63 |
+
w = weights.get(result.engine, 0.50)
|
| 64 |
+
c = float(result.confidence)
|
| 65 |
+
|
| 66 |
+
if result.verdict == "UNKNOWN":
|
| 67 |
+
return {"FAKE": 0.0, "REAL": 0.0, "Θ": 1.0}
|
| 68 |
+
if result.verdict == "FAKE":
|
| 69 |
+
return {
|
| 70 |
+
"FAKE": c * w,
|
| 71 |
+
"REAL": (1.0 - c) * w,
|
| 72 |
+
"Θ": 1.0 - w,
|
| 73 |
+
}
|
| 74 |
+
# verdict == "REAL"
|
| 75 |
+
return {
|
| 76 |
+
"REAL": c * w,
|
| 77 |
+
"FAKE": (1.0 - c) * w,
|
| 78 |
+
"Θ": 1.0 - w,
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def _ds_combine(m1: _BPA, m2: _BPA) -> _BPA:
|
| 83 |
+
"""
|
| 84 |
+
Dempster's combination rule for two BPAs over {FAKE, REAL, Θ}.
|
| 85 |
+
|
| 86 |
+
K = conflict = Σ_{A∩B=∅} m1(A)·m2(B)
|
| 87 |
+
m12(C) = Σ_{A∩B=C} m1(A)·m2(B) / (1 - K) for C ≠ ∅
|
| 88 |
+
"""
|
| 89 |
+
# Conflict mass: FAKE ∩ REAL = ∅, so conflict = FAKE×REAL + REAL×FAKE
|
| 90 |
+
K = m1["FAKE"] * m2["REAL"] + m1["REAL"] * m2["FAKE"]
|
| 91 |
+
|
| 92 |
+
# Unnormalised joint masses
|
| 93 |
+
raw_fake = (
|
| 94 |
+
m1["FAKE"] * m2["FAKE"] # FAKE ∩ FAKE = FAKE
|
| 95 |
+
+ m1["FAKE"] * m2["Θ"] # FAKE ∩ Θ = FAKE
|
| 96 |
+
+ m1["Θ"] * m2["FAKE"] # Θ ∩ FAKE = FAKE
|
| 97 |
+
)
|
| 98 |
+
raw_real = (
|
| 99 |
+
m1["REAL"] * m2["REAL"]
|
| 100 |
+
+ m1["REAL"] * m2["Θ"]
|
| 101 |
+
+ m1["Θ"] * m2["REAL"]
|
| 102 |
+
)
|
| 103 |
+
raw_theta = m1["Θ"] * m2["Θ"] # Θ ∩ Θ = Θ
|
| 104 |
+
|
| 105 |
+
norm = 1.0 - K
|
| 106 |
+
if norm < 1e-9:
|
| 107 |
+
# Total conflict → maximum uncertainty
|
| 108 |
+
return {"FAKE": 0.5, "REAL": 0.5, "Θ": 0.0}
|
| 109 |
+
|
| 110 |
+
return {
|
| 111 |
+
"FAKE": raw_fake / norm,
|
| 112 |
+
"REAL": raw_real / norm,
|
| 113 |
+
"Θ": raw_theta / norm,
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
|
| 117 |
def fuse(results: list[EngineResult], is_video: bool = False) -> tuple[str, float, str]:
|
| 118 |
+
"""
|
| 119 |
+
Dempster-Shafer fusion of engine results.
|
| 120 |
+
|
| 121 |
+
Returns (verdict, confidence_for_verdict, attributed_generator).
|
| 122 |
|
| 123 |
+
Confidence is derived via the pignistic probability transform (Smets 1990):
|
| 124 |
+
ignorance mass Θ is split equally between FAKE and REAL before thresholding.
|
| 125 |
+
This avoids overconfident verdicts when engines disagree.
|
| 126 |
+
"""
|
| 127 |
+
active = [r for r in results if r.verdict != "UNKNOWN"]
|
| 128 |
|
| 129 |
if not active:
|
| 130 |
+
return "UNKNOWN", 0.5, "unknown_generative"
|
| 131 |
|
| 132 |
+
# Build and combine BPAs iteratively
|
| 133 |
+
bpas = [_engine_to_bpa(r, is_video) for r in active]
|
| 134 |
+
combined = bpas[0]
|
| 135 |
+
for bpa in bpas[1:]:
|
| 136 |
+
combined = _ds_combine(combined, bpa)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
+
# Pignistic transform: distribute Θ mass equally
|
| 139 |
+
theta = combined.get("Θ", 0.0)
|
| 140 |
+
pign_fake = combined["FAKE"] + theta / 2.0
|
| 141 |
+
pign_real = combined["REAL"] + theta / 2.0
|
| 142 |
+
pign_total = pign_fake + pign_real + 1e-9
|
| 143 |
+
|
| 144 |
+
fake_prob = float(np.clip(pign_fake / pign_total, 0.0, 1.0))
|
| 145 |
verdict = "FAKE" if fake_prob > 0.5 else "REAL"
|
| 146 |
confidence = fake_prob if verdict == "FAKE" else (1.0 - fake_prob)
|
| 147 |
|
| 148 |
+
# Generator attribution: highest-priority engine with a non-real label
|
| 149 |
generator = "real"
|
| 150 |
if verdict == "FAKE":
|
| 151 |
for result in sorted(active, key=lambda r: ATTRIBUTION_PRIORITY.get(r.engine, 9)):
|
|
|
|
| 154 |
generator = candidate
|
| 155 |
break
|
| 156 |
if generator == "real":
|
| 157 |
+
generator = "unknown_generative"
|
| 158 |
|
| 159 |
+
return verdict, float(np.clip(confidence, 0.0, 1.0)), generator
|
| 160 |
|
| 161 |
|
| 162 |
class Fuser:
|
|
|
|
| 172 |
return DetectionResponse(
|
| 173 |
verdict="REAL",
|
| 174 |
confidence=0.5,
|
| 175 |
+
attributed_generator="unknown_generative",
|
| 176 |
explanation="No engine results available.",
|
| 177 |
processing_time_ms=round(total_ms, 2),
|
| 178 |
engine_breakdown=[],
|
|
|
|
| 187 |
f"{result.engine}:{result.verdict}({result.confidence:.2f})"
|
| 188 |
for result in results
|
| 189 |
)
|
| 190 |
+
explanation = (
|
| 191 |
+
f"Dempster-Shafer fusion ({media_type}) from engines: {summary}."
|
| 192 |
+
)
|
| 193 |
|
| 194 |
return DetectionResponse(
|
| 195 |
verdict=verdict,
|
src/types.py
CHANGED
|
@@ -12,27 +12,29 @@ from pydantic import BaseModel, field_validator
|
|
| 12 |
|
| 13 |
|
| 14 |
class GeneratorLabel(str, Enum):
|
| 15 |
-
"""Generator attribution labels
|
| 16 |
|
| 17 |
real = "real"
|
| 18 |
-
|
|
|
|
|
|
|
| 19 |
stable_diffusion = "stable_diffusion"
|
|
|
|
| 20 |
midjourney = "midjourney"
|
| 21 |
dall_e = "dall_e"
|
| 22 |
-
|
| 23 |
-
firefly = "firefly"
|
| 24 |
-
imagen = "imagen"
|
| 25 |
|
| 26 |
|
| 27 |
GENERATOR_INDEX_TO_LABEL: dict[int, GeneratorLabel] = {
|
| 28 |
0: GeneratorLabel.real,
|
| 29 |
-
1: GeneratorLabel.
|
| 30 |
-
2: GeneratorLabel.
|
| 31 |
-
3: GeneratorLabel.
|
| 32 |
-
4: GeneratorLabel.
|
| 33 |
-
5: GeneratorLabel.
|
| 34 |
-
6: GeneratorLabel.
|
| 35 |
-
7: GeneratorLabel.
|
|
|
|
| 36 |
}
|
| 37 |
|
| 38 |
|
|
@@ -46,6 +48,10 @@ class EngineResult(BaseModel):
|
|
| 46 |
explanation: str = ""
|
| 47 |
processing_time_ms: float = 0.0
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
@field_validator("confidence")
|
| 50 |
@classmethod
|
| 51 |
def confidence_in_range(cls, value: float) -> float:
|
|
@@ -71,6 +77,13 @@ class DetectionResponse(BaseModel):
|
|
| 71 |
processing_time_ms: float
|
| 72 |
engine_breakdown: list[EngineResult]
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
# Optional explainability metadata
|
| 75 |
clarity_score: Optional[float] = None
|
| 76 |
saliency_map_url: Optional[str] = None
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
class GeneratorLabel(str, Enum):
    """Generator attribution labels — aligned to paper's 8-generator taxonomy."""

    # Member order is significant: iteration order follows declaration order.

    # Content judged authentic.
    real = "real"

    # Named generators.
    sora = "sora"
    runway = "runway"
    wav2lip = "wav2lip"
    stable_diffusion = "stable_diffusion"
    sdxl = "sdxl"
    midjourney = "midjourney"
    dall_e = "dall_e"

    # Catch-all for content not attributed to a named generator.
    unknown_generative = "unknown_generative"
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
GENERATOR_INDEX_TO_LABEL: dict[int, GeneratorLabel] = {
|
| 29 |
0: GeneratorLabel.real,
|
| 30 |
+
1: GeneratorLabel.sora,
|
| 31 |
+
2: GeneratorLabel.runway,
|
| 32 |
+
3: GeneratorLabel.wav2lip,
|
| 33 |
+
4: GeneratorLabel.stable_diffusion,
|
| 34 |
+
5: GeneratorLabel.sdxl,
|
| 35 |
+
6: GeneratorLabel.midjourney,
|
| 36 |
+
7: GeneratorLabel.dall_e,
|
| 37 |
+
8: GeneratorLabel.unknown_generative,
|
| 38 |
}
|
| 39 |
|
| 40 |
|
|
|
|
| 48 |
explanation: str = ""
|
| 49 |
processing_time_ms: float = 0.0
|
| 50 |
|
| 51 |
+
# Audio coherence sub-scores (populated by CoherenceEngine on video input)
|
| 52 |
+
audio_sync_score: Optional[float] = None
|
| 53 |
+
timestamp_markers: list[dict] = []
|
| 54 |
+
|
| 55 |
@field_validator("confidence")
|
| 56 |
@classmethod
|
| 57 |
def confidence_in_range(cls, value: float) -> float:
|
|
|
|
| 77 |
processing_time_ms: float
|
| 78 |
engine_breakdown: list[EngineResult]
|
| 79 |
|
| 80 |
+
# Module 4 — Continual Learning novelty signal
|
| 81 |
+
novelty_score: Optional[float] = None
|
| 82 |
+
|
| 83 |
+
# Module 1 — Audio lip-sync coherence sub-scores
|
| 84 |
+
audio_sync_score: Optional[float] = None
|
| 85 |
+
timestamp_markers: list[dict] = []
|
| 86 |
+
|
| 87 |
# Optional explainability metadata
|
| 88 |
clarity_score: Optional[float] = None
|
| 89 |
saliency_map_url: Optional[str] = None
|