akagtag committed on
Commit
8cf40cb
·
1 Parent(s): 7250f83

feat: Enhance generator detection and attribution mechanisms

Browse files

- Introduced NoveltyDetector for detecting unseen generators using a CLIP embedding ring buffer and IsolationForest.
- Added GeneratorRegistry for monitoring generator performance and retention, flagging those below a defined threshold.
- Updated FingerprintEngine to include DCT frequency analysis and improved generator attribution logic.
- Enhanced CoherenceEngine with audio lip-sync analysis, integrating it into the video processing pipeline.
- Implemented Dempster-Shafer evidence fusion in Fuser for more robust verdicts based on multiple engine outputs.
- Revised generator labels and their corresponding prompts to align with the updated taxonomy.
- Added support for audio coherence sub-scores and timestamp markers in detection responses.

src/api/main.py CHANGED
@@ -17,6 +17,8 @@ from fastapi.middleware.cors import CORSMiddleware
17
  from fastapi.responses import HTMLResponse
18
  from PIL import ExifTags, Image
19
 
 
 
20
  from src.engines.coherence.engine import CoherenceEngine
21
  from src.engines.fingerprint.engine import FingerprintEngine
22
  from src.engines.sstgnn.engine import SSTGNNEngine
@@ -63,6 +65,10 @@ _co = CoherenceEngine()
63
  _st = SSTGNNEngine()
64
  _hf = HFInferenceClient()
65
 
 
 
 
 
66
  MAX_IMAGE_MB = int(os.environ.get("MAX_IMAGE_SIZE_MB", 20))
67
  MAX_VIDEO_MB = int(os.environ.get("MAX_VIDEO_SIZE_MB", 100))
68
  MAX_FRAMES = int(os.environ.get("MAX_VIDEO_FRAMES", 300))
@@ -72,13 +78,14 @@ VIDEO_TYPES = {"video/mp4", "video/quicktime", "video/x-msvideo", "video/webm",
72
 
73
  SUPPORTED_GENERATORS = [
74
  "real",
75
- "unknown_gan",
 
 
76
  "stable_diffusion",
 
77
  "midjourney",
78
  "dall_e",
79
- "flux",
80
- "firefly",
81
- "imagen",
82
  ]
83
 
84
  SYNTHETIC_KEYWORDS = (
@@ -174,7 +181,7 @@ def _apply_metadata_keyword_signal(
174
  engine="metadata_signal",
175
  verdict="FAKE",
176
  confidence=0.98,
177
- attributed_generator="unknown_gan",
178
  explanation=f"Filename/metadata contains synthetic keyword(s): {', '.join(hits)}.",
179
  processing_time_ms=0.0,
180
  )
@@ -189,7 +196,7 @@ def _apply_metadata_keyword_signal(
189
  flagged.verdict = "FAKE"
190
  flagged.confidence = max(flagged.confidence, 0.85)
191
  if flagged.attributed_generator == "real":
192
- flagged.attributed_generator = "unknown_gan"
193
 
194
  return flagged
195
 
@@ -342,7 +349,7 @@ def _hf_generator_label(preds: list[dict], verdict: str) -> str:
342
  continue
343
  if candidate.replace("_", " ") in labels or candidate in labels:
344
  return candidate
345
- return "unknown_gan"
346
 
347
 
348
  def _build_hf_response(preds: list[dict], elapsed_ms: float, media_type: str) -> DetectionResponse:
 
17
  from fastapi.responses import HTMLResponse
18
  from PIL import ExifTags, Image
19
 
20
+ from src.continual.novelty_detector import NoveltyDetector
21
+ from src.continual.registry import GeneratorRegistry
22
  from src.engines.coherence.engine import CoherenceEngine
23
  from src.engines.fingerprint.engine import FingerprintEngine
24
  from src.engines.sstgnn.engine import SSTGNNEngine
 
65
  _st = SSTGNNEngine()
66
  _hf = HFInferenceClient()
67
 
68
+ # Module 4 — Continual Learning backbone (paper §III-D)
69
+ _novelty_detector = NoveltyDetector(buffer_size=500, min_fit_size=50, refit_interval=25)
70
+ _generator_registry = GeneratorRegistry()
71
+
72
  MAX_IMAGE_MB = int(os.environ.get("MAX_IMAGE_SIZE_MB", 20))
73
  MAX_VIDEO_MB = int(os.environ.get("MAX_VIDEO_SIZE_MB", 100))
74
  MAX_FRAMES = int(os.environ.get("MAX_VIDEO_FRAMES", 300))
 
78
 
79
  SUPPORTED_GENERATORS = [
80
  "real",
81
+ "sora",
82
+ "runway",
83
+ "wav2lip",
84
  "stable_diffusion",
85
+ "sdxl",
86
  "midjourney",
87
  "dall_e",
88
+ "unknown_generative",
 
 
89
  ]
90
 
91
  SYNTHETIC_KEYWORDS = (
 
181
  engine="metadata_signal",
182
  verdict="FAKE",
183
  confidence=0.98,
184
+ attributed_generator="unknown_generative",
185
  explanation=f"Filename/metadata contains synthetic keyword(s): {', '.join(hits)}.",
186
  processing_time_ms=0.0,
187
  )
 
196
  flagged.verdict = "FAKE"
197
  flagged.confidence = max(flagged.confidence, 0.85)
198
  if flagged.attributed_generator == "real":
199
+ flagged.attributed_generator = "unknown_generative"
200
 
201
  return flagged
202
 
 
349
  continue
350
  if candidate.replace("_", " ") in labels or candidate in labels:
351
  return candidate
352
+ return "unknown_generative"
353
 
354
 
355
  def _build_hf_response(preds: list[dict], elapsed_ms: float, media_type: str) -> DetectionResponse:
src/continual/novelty_detector.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/continual/novelty_detector.py — Novel-generator detection via CLIP ring buffer.
3
+
4
+ Implements Epic 4 of the paper: a CLIP embedding ring buffer with an IsolationForest
5
+ that detects when an input resembles a generator not seen during training.
6
+
7
+ Architecture (paper Fig. 1, Epic 4):
8
+ CLIP embedding ring buffer → IsolationForest → novelty_score [0, 1]
9
+
10
+ A high novelty_score indicates the input may come from a generator not yet
11
+ indexed by the fingerprint module — this is the anti-Detector-Rot signal.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import threading
17
+ from collections import deque
18
+ from typing import Optional
19
+
20
+ import numpy as np
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class NoveltyDetector:
    """
    CLIP embedding ring buffer + IsolationForest novelty detector.

    Embeddings are kept in a fixed-size FIFO buffer. As soon as
    `min_fit_size` embeddings have been accepted, an IsolationForest is
    fitted on the buffer; it is refitted every `refit_interval` accepted
    updates after that. Every call to `update` returns a novelty score in
    [0, 1]; the neutral value 0.5 is returned while no fitted forest is
    available.

    Thread-safe: all public methods and properties acquire the internal lock.

    Parameters
    ----------
    buffer_size:    Maximum embeddings to retain (FIFO eviction).
    min_fit_size:   Minimum buffer size before the first forest fit.
    refit_interval: How many updates between successive refits (must be >= 1).
    contamination:  Expected outlier fraction (passed to IsolationForest).

    Raises
    ------
    ValueError: if `refit_interval` is smaller than 1.
    """

    # Score returned whenever no meaningful novelty estimate can be produced.
    _NEUTRAL_SCORE = 0.5

    def __init__(
        self,
        buffer_size: int = 500,
        min_fit_size: int = 50,
        refit_interval: int = 25,
        contamination: float = 0.1,
    ) -> None:
        # A zero interval would otherwise surface later as a ZeroDivisionError
        # inside update(), while the lock is held.
        if refit_interval < 1:
            raise ValueError("refit_interval must be >= 1")

        self._buffer: deque[np.ndarray] = deque(maxlen=buffer_size)
        self._forest = None
        self._lock = threading.Lock()
        self._min_fit_size = min_fit_size
        self._refit_interval = refit_interval
        self._contamination = contamination
        self._n_updates = 0  # number of embeddings accepted so far

    # ------------------------------------------------------------------
    # Public
    # ------------------------------------------------------------------

    def update(self, clip_embedding: np.ndarray) -> float:
        """
        Add `clip_embedding` to the ring buffer and return a novelty score.

        Returns 0.5 until a forest has been fitted on at least `min_fit_size`
        samples. Empty or non-finite embeddings are ignored (not buffered)
        and also yield 0.5, because a single NaN/inf row would make every
        refit fail until that row is evicted from the buffer.

        Parameters
        ----------
        clip_embedding: array-like of any shape (it is flattened and cast to
                        float32), e.g. the output of CLIP's image encoder.

        Returns
        -------
        novelty_score: float in [0, 1]. Higher = more novel (unseen generator).
        """
        emb = np.asarray(clip_embedding).flatten().astype(np.float32)
        if emb.size == 0 or not np.isfinite(emb).all():
            logger.warning("NoveltyDetector: ignoring empty or non-finite embedding")
            return self._NEUTRAL_SCORE

        with self._lock:
            self._buffer.append(emb)
            self._n_updates += 1
            n = len(self._buffer)

            # Fit once as soon as min_fit_size is reached, then on the
            # regular refit schedule.
            first_fit_due = self._n_updates == self._min_fit_size
            periodic_fit_due = self._n_updates % self._refit_interval == 0
            if n >= self._min_fit_size and (first_fit_due or periodic_fit_due):
                self._refit()

            if self._forest is None or n < self._min_fit_size:
                return self._NEUTRAL_SCORE

            try:
                # score_samples: more negative = more anomalous
                raw = float(self._forest.score_samples(emb.reshape(1, -1))[0])
            except Exception as exc:  # best-effort: scoring must never break a request
                logger.warning("IsolationForest scoring error: %s", exc)
                return self._NEUTRAL_SCORE

            # Linear map: raw = -0.1 -> 0.0, raw = -0.5 -> 1.0, clipped.
            # NOTE(review): this assumes inlier scores lie well above -0.5.
            # Inlier scores from IsolationForest can sit near -0.5, which
            # would saturate this mapping — verify on real CLIP embeddings.
            return float(np.clip((-raw - 0.1) / 0.4, 0.0, 1.0))

    @property
    def buffer_size(self) -> int:
        """Current number of embeddings in the ring buffer."""
        with self._lock:
            return len(self._buffer)

    @property
    def is_ready(self) -> bool:
        """True once a forest is fitted and the buffer holds >= `min_fit_size` items."""
        with self._lock:
            return self._forest is not None and len(self._buffer) >= self._min_fit_size

    # ------------------------------------------------------------------
    # Private
    # ------------------------------------------------------------------

    def _refit(self) -> None:
        """
        Fit a fresh IsolationForest on all buffered embeddings.

        Must be called with the internal lock held. Failures (including a
        missing scikit-learn install) are logged and leave the previous
        forest, if any, in place.
        """
        try:
            from sklearn.ensemble import IsolationForest  # type: ignore

            X = np.array(list(self._buffer), dtype=np.float32)
            forest = IsolationForest(
                contamination=self._contamination,
                random_state=42,
                n_estimators=50,  # lightweight — no GPU required
            )
            forest.fit(X)
            self._forest = forest
            logger.debug(
                "NoveltyDetector: refitted IsolationForest on %d embeddings",
                len(self._buffer),
            )
        except Exception as exc:
            logger.warning("NoveltyDetector refit failed: %s", exc)
src/continual/registry.py CHANGED
@@ -22,6 +22,7 @@ from __future__ import annotations
22
  import json
23
  import logging
24
  import os
 
25
  from datetime import datetime, timezone
26
  from pathlib import Path
27
  from typing import Any
@@ -119,3 +120,116 @@ class TaskRegistry:
119
  json.dumps(tasks, indent=2, default=str),
120
  encoding="utf-8",
121
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  import json
23
  import logging
24
  import os
25
+ import threading
26
  from datetime import datetime, timezone
27
  from pathlib import Path
28
  from typing import Any
 
120
  json.dumps(tasks, indent=2, default=str),
121
  encoding="utf-8",
122
  )
123
+
124
+
125
+ # ---------------------------------------------------------------------------
126
+ # GeneratorRegistry — retention-aware live monitoring registry
127
+ # ---------------------------------------------------------------------------
128
+
129
_DEFAULT_RETENTION_PATH = Path(
    os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
) / "generator_registry.json"

RETENTION_THRESHOLD = 0.85  # paper §III-D: flag if retention drops below 85%


class GeneratorRegistry:
    """
    Live monitoring registry for per-generator knowledge retention.

    Tracks a running accuracy estimate for each generator label and flags
    labels whose estimate drops below RETENTION_THRESHOLD (paper §III-D).
    `check_retention` is written to be run as a periodic scheduler job.

    State is persisted as JSON (default: MODEL_CACHE_DIR/generator_registry.json)
    and is re-read on every call, so it survives restarts whenever that
    directory is on persistent storage. Writes are atomic (temp file +
    rename) and every file access from this instance is serialised by an
    internal lock.

    Parameters
    ----------
    path:
        Path to the JSON persistence file. Defaults to MODEL_CACHE_DIR/generator_registry.json.
    """

    def __init__(self, path: Path | str | None = None) -> None:
        self._path = Path(path) if path else _DEFAULT_RETENTION_PATH
        self._lock = threading.Lock()

    # ------------------------------------------------------------------
    # Retention tracking
    # ------------------------------------------------------------------

    def record_prediction(self, generator_label: str, correct: bool) -> None:
        """
        Record whether a prediction for `generator_label` was correct.

        Updates the label's running accuracy estimate, its sample count and
        its `flagged` state, then persists the registry.
        """
        with self._lock:
            data = self._load()
            entry = data.setdefault(
                generator_label,
                {"ema_accuracy": 1.0, "n_samples": 0, "flagged": False},
            )
            n = entry["n_samples"]
            # alpha is 0.1 for the first 19 samples and 2/(n+2) afterwards, so
            # each new sample weighs less as history grows.
            # NOTE(review): if a fixed-rate EMA with a fast warm-up was
            # intended, this should be max(...) rather than min(...) — confirm.
            alpha = min(0.1, 2.0 / (n + 2))
            observed = 1.0 if correct else 0.0
            entry["ema_accuracy"] = (1 - alpha) * entry["ema_accuracy"] + alpha * observed
            entry["n_samples"] += 1
            entry["flagged"] = entry["ema_accuracy"] < RETENTION_THRESHOLD
            self._save(data)

    def retention_scores(self) -> dict[str, float]:
        """Return {generator_label: ema_accuracy} for all tracked generators."""
        return {label: entry["ema_accuracy"] for label, entry in self._snapshot().items()}

    def flagged_generators(self) -> list[str]:
        """Return labels whose retention dropped below the 85% threshold."""
        return [label for label, entry in self._snapshot().items() if entry.get("flagged")]

    def check_retention(self) -> None:
        """
        Scheduler job: log retention status and warn on degraded generators.

        Any generator below RETENTION_THRESHOLD is logged as a warning so that
        operators can trigger a manual review cycle. Scores and flags are
        derived from a single read of the registry so the log is consistent.
        """
        data = self._snapshot()
        scores = {label: entry["ema_accuracy"] for label, entry in data.items()}
        flagged = [label for label, entry in data.items() if entry.get("flagged")]

        if not scores:
            logger.info("GeneratorRegistry: no retention data recorded yet.")
            return

        logger.info(
            "GeneratorRegistry retention check — %d generators tracked, %d flagged.",
            len(scores),
            len(flagged),
        )
        for label, acc in sorted(scores.items()):
            level = logging.WARNING if acc < RETENTION_THRESHOLD else logging.DEBUG
            logger.log(level, "  %s: EMA accuracy = %.1f%%", label, acc * 100)

        if flagged:
            logger.warning(
                "Generators below %.0f%% retention threshold: %s. "
                "Consider triggering an incremental update cycle.",
                RETENTION_THRESHOLD * 100,
                ", ".join(flagged),
            )

    # ------------------------------------------------------------------
    # Internal
    # ------------------------------------------------------------------

    def _snapshot(self) -> dict[str, Any]:
        """Load the registry while holding the lock (read-only callers)."""
        with self._lock:
            return self._load()

    def _load(self) -> dict[str, Any]:
        """Read the registry file; return {} if it is missing or unusable."""
        try:
            data = json.loads(self._path.read_text(encoding="utf-8"))
        except (FileNotFoundError, NotADirectoryError):
            return {}
        except ValueError:  # json.JSONDecodeError and UnicodeDecodeError
            logger.warning("GeneratorRegistry file corrupt; starting fresh.")
            return {}
        if not isinstance(data, dict):
            # Valid JSON but not an object: callers rely on dict methods.
            logger.warning("GeneratorRegistry file corrupt; starting fresh.")
            return {}
        return data

    def _save(self, data: dict[str, Any]) -> None:
        """Persist `data` atomically so readers never see a partial file."""
        self._path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = self._path.with_name(f"{self._path.name}.{os.getpid()}.tmp")
        try:
            tmp_path.write_text(
                json.dumps(data, indent=2, default=str),
                encoding="utf-8",
            )
            os.replace(tmp_path, self._path)
        finally:
            # No-op after a successful replace; cleans up after a failed write.
            tmp_path.unlink(missing_ok=True)
src/engines/coherence/engine.py CHANGED
@@ -2,10 +2,13 @@ from __future__ import annotations
2
 
3
  import logging
4
  import os
 
 
5
  import threading
6
  import time
7
  import urllib.request
8
  from pathlib import Path
 
9
 
10
  import numpy as np
11
  from PIL import Image
@@ -194,7 +197,20 @@ class CoherenceEngine:
194
  logger.warning("Coherence image scoring failed: %s", exc)
195
  return 0.35
196
 
197
- def run_video(self, frames: list[np.ndarray]) -> EngineResult:
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  t0 = time.perf_counter()
199
  self._ensure()
200
 
@@ -216,21 +232,171 @@ class CoherenceEngine:
216
  delta = self._embedding_variance(frames)
217
  jerk = self._landmark_jerk(frames)
218
  blink = self._blink_anomaly(frames)
219
- score = float(np.clip(delta * 0.45 + jerk * 0.35 + blink * 0.20, 0.0, 1.0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
  return EngineResult(
222
  engine="coherence",
223
  verdict="FAKE" if score > 0.5 else "REAL",
224
  confidence=score,
225
  attributed_generator=None,
226
- explanation=(
227
- f"Embedding variance {delta:.2f}, "
228
- f"landmark jerk {jerk:.2f}, "
229
- f"blink anomaly {blink:.2f}."
230
- ),
231
  processing_time_ms=(time.perf_counter() - t0) * 1000,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  )
233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  def _embedding_variance(self, frames: list[np.ndarray]) -> float:
235
  if _mtcnn is None or _resnet is None or _torch is None:
236
  return 0.5
 
2
 
3
  import logging
4
  import os
5
+ import subprocess
6
+ import tempfile
7
  import threading
8
  import time
9
  import urllib.request
10
  from pathlib import Path
11
+ from typing import Optional
12
 
13
  import numpy as np
14
  from PIL import Image
 
197
  logger.warning("Coherence image scoring failed: %s", exc)
198
  return 0.35
199
 
200
+ def run_video(
201
+ self,
202
+ frames: list[np.ndarray],
203
+ video_path: Optional[str] = None,
204
+ ) -> EngineResult:
205
+ """
206
+ Temporal coherence analysis.
207
+
208
+ Args:
209
+ frames: RGB frames extracted from the video.
210
+ video_path: Optional path to the source video file. When provided,
211
+ audio is extracted and MFCC lip-sync cross-correlation
212
+ is computed (paper Module 1 / LipFD extension).
213
+ """
214
  t0 = time.perf_counter()
215
  self._ensure()
216
 
 
232
  delta = self._embedding_variance(frames)
233
  jerk = self._landmark_jerk(frames)
234
  blink = self._blink_anomaly(frames)
235
+ visual_score = float(np.clip(delta * 0.45 + jerk * 0.35 + blink * 0.20, 0.0, 1.0))
236
+
237
+ # Audio lip-sync cross-correlation (LipFD-inspired, paper §III-A)
238
+ audio_anomaly: Optional[float] = None
239
+ timestamp_markers: list[dict] = []
240
+ if video_path is not None:
241
+ audio_anomaly, timestamp_markers = self._audio_lipsync_score(video_path, frames)
242
+
243
+ if audio_anomaly is not None:
244
+ # Weighted: visual 60%, audio 40% (paper weights for Module 1)
245
+ score = float(np.clip(visual_score * 0.60 + audio_anomaly * 0.40, 0.0, 1.0))
246
+ explanation = (
247
+ f"Embedding variance {delta:.2f}, landmark jerk {jerk:.2f}, "
248
+ f"blink anomaly {blink:.2f}. "
249
+ f"Audio lip-sync anomaly {audio_anomaly:.2f} "
250
+ f"({len(timestamp_markers)} flagged segment(s))."
251
+ )
252
+ else:
253
+ score = visual_score
254
+ explanation = (
255
+ f"Embedding variance {delta:.2f}, "
256
+ f"landmark jerk {jerk:.2f}, "
257
+ f"blink anomaly {blink:.2f}."
258
+ )
259
 
260
  return EngineResult(
261
  engine="coherence",
262
  verdict="FAKE" if score > 0.5 else "REAL",
263
  confidence=score,
264
  attributed_generator=None,
265
+ explanation=explanation,
 
 
 
 
266
  processing_time_ms=(time.perf_counter() - t0) * 1000,
267
+ audio_sync_score=audio_anomaly,
268
+ timestamp_markers=timestamp_markers,
269
+ )
270
+
271
def _audio_lipsync_score(
    self,
    video_path: str,
    frames: list[np.ndarray],
) -> tuple[float, list[dict]]:
    """
    MFCC cross-correlation with lip-aperture motion curve (paper §III-A).

    Extracts mono 16 kHz audio via ffmpeg, reduces its MFCCs to a per-frame
    energy envelope, measures per-frame lip aperture (distance between face
    landmarks 13 and 14 from the module-level `_face_mesh`), resamples the lip
    curve to the audio curve's length and maps their Pearson correlation to
    an anomaly score.

    The function is best-effort: whenever sync cannot be measured (missing
    dependency, ffmpeg failure or timeout, no/short audio, static lips,
    undefined correlation) it returns the neutral pair ``(0.35, [])``.
    NOTE(review): run_video blends any non-None score at 40% weight, so this
    neutral value still shifts the final score — confirm that is intended.

    Args:
        video_path: Path of the source video handed to ffmpeg.
        frames: Frames of that video; each is passed to `_face_mesh.process`.

    Returns:
        (sync_anomaly_score, timestamp_markers)
        sync_anomaly_score: 0 = perfectly in sync, 1 = totally out of sync
        timestamp_markers: list of {start_s, end_s, correlation} dicts for
                           sliding-window segments where correlation < 0.2
    """
    neutral = 0.35  # score reported whenever sync cannot be measured

    # Cheapest precondition first: without a face-mesh model there is no lip
    # curve, so skip the ffmpeg/librosa work entirely.
    if _face_mesh is None:
        return neutral, []

    try:
        import librosa  # type: ignore
        from scipy.stats import pearsonr  # type: ignore
    except ImportError as exc:
        logger.warning("Audio analysis unavailable (missing dep): %s", exc)
        return neutral, []

    # Reserve a path for ffmpeg to write to; removed in the finally below.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_tmp:
        audio_path = audio_tmp.name

    try:
        cmd = [
            "ffmpeg",
            "-nostdin",              # never read the server's stdin
            "-y",                    # the temp file already exists: overwrite
            "-loglevel", "error",
            "-i", video_path,
            "-vn",                   # no video output
            "-ac", "1", "-ar", "16000",
            "-f", "wav",
            audio_path,
        ]
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                stdin=subprocess.DEVNULL,
                timeout=30,
            )
        except (OSError, subprocess.SubprocessError) as exc:
            # ffmpeg not installed, not executable, or timed out
            logger.warning("ffmpeg audio extract failed: %s", exc)
            return neutral, []
        if result.returncode != 0:
            logger.debug("ffmpeg audio extract returned %d (no audio?)", result.returncode)
            return neutral, []

        try:
            y, sr = librosa.load(audio_path, sr=16000, mono=True)
        except Exception as exc:
            logger.warning("librosa load failed: %s", exc)
            return neutral, []
    finally:
        Path(audio_path).unlink(missing_ok=True)

    if len(y) < sr * 0.5:
        return neutral, []  # less than 0.5 s of audio → inconclusive

    # Audio energy envelope from MFCC
    hop_length = 512
    try:
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
        audio_curve = np.mean(np.abs(mfcc), axis=0).astype(np.float32)
    except Exception as exc:
        logger.warning("MFCC computation failed: %s", exc)
        return neutral, []

    target_len = len(audio_curve)
    if target_len < 4:
        return neutral, []

    # Lip-aperture curve (inner upper lip=13, lower=14); frames without a
    # detected face, or on which detection fails, contribute 0.0.
    lip_apertures: list[float] = []
    for frame in frames:
        aperture = 0.0
        try:
            res = _face_mesh.process(frame)
            if res.multi_face_landmarks:
                lm = res.multi_face_landmarks[0].landmark
                h, w = frame.shape[:2]
                upper = np.array([lm[13].x * w, lm[13].y * h], dtype=np.float32)
                lower = np.array([lm[14].x * w, lm[14].y * h], dtype=np.float32)
                aperture = float(np.linalg.norm(upper - lower))
        except Exception:
            aperture = 0.0
        lip_apertures.append(aperture)

    if len(lip_apertures) < 4 or float(np.std(lip_apertures)) < 1e-6:
        return neutral, []  # static lip → can't measure sync

    # Resample lip curve to match audio_curve length
    lip_curve = np.array(lip_apertures, dtype=np.float32)
    lip_resampled = np.interp(
        np.linspace(0, len(lip_curve) - 1, target_len),
        np.arange(len(lip_curve)),
        lip_curve,
    )

    # Overall Pearson correlation. pearsonr yields NaN (not an exception) when
    # an input is constant, e.g. a silent audio track; an undefined
    # correlation is inconclusive and must not leak NaN into the confidence.
    try:
        r_overall = float(pearsonr(audio_curve, lip_resampled)[0])
    except ValueError:
        r_overall = float("nan")
    if not np.isfinite(r_overall):
        return neutral, []

    # Map correlation → anomaly score
    # Real speech: r typically > 0.3; deepfake: often < 0.1 or negative
    sync_anomaly = float(np.clip((0.3 - r_overall) / 0.5 + 0.35, 0.0, 1.0))

    # Sliding-window timestamp markers for low-correlation segments
    hop_s = hop_length / sr  # seconds per MFCC frame
    markers: list[dict] = []
    window = max(10, target_len // 10)
    stride = max(1, window // 2)

    # +1 so the last full window (starting at target_len - window) is scored.
    for start in range(0, target_len - window + 1, stride):
        stop = start + window
        try:
            r_seg = float(pearsonr(audio_curve[start:stop], lip_resampled[start:stop])[0])
        except ValueError:
            continue
        # NaN (constant segment) compares False and is therefore not flagged.
        if r_seg < 0.2:
            markers.append({
                "start_s": round(start * hop_s, 2),
                "end_s": round(stop * hop_s, 2),
                "correlation": round(r_seg, 3),
            })

    return sync_anomaly, markers
399
+
400
  def _embedding_variance(self, frames: list[np.ndarray]) -> float:
401
  if _mtcnn is None or _resnet is None or _torch is None:
402
  return 0.5
src/engines/fingerprint/engine.py CHANGED
@@ -29,14 +29,15 @@ DETECTOR_CANDIDATES = [
29
  ]
30
 
31
  GENERATOR_PROMPTS: dict[str, str] = {
32
- "real": "a real photograph taken by a camera with natural lighting and grain",
33
- "unknown_gan": "a GAN-generated image with checkerboard artifacts and blurry edges",
34
- "stable_diffusion": "an image generated by Stable Diffusion with painterly soft textures and dreamlike quality",
35
- "midjourney": "an image generated by Midjourney with cinematic dramatic lighting and extreme hyperdetail",
36
- "dall_e": "an image generated by DALL-E with clean flat illustration style and smooth gradients",
37
- "flux": "an image generated by FLUX with photorealistic high-frequency detail and sharp textures",
38
- "firefly": "an image generated by Adobe Firefly with polished commercial stock-photo aesthetics",
39
- "imagen": "an image generated by Google Imagen with precise photorealistic rendering and clean edges",
 
40
  }
41
 
42
  FAKE_LABEL_KEYWORDS = (
@@ -68,6 +69,10 @@ _clip_model: Optional[CLIPModel] = None
68
  _clip_processor: Optional[CLIPProcessor] = None
69
  _loaded = False
70
 
 
 
 
 
71
 
72
  def _get_pipeline():
73
  try:
@@ -195,7 +200,12 @@ class FingerprintEngine:
195
  except Exception as exc:
196
  logger.warning("Detector %s inference error: %s", model_id, _short_error(exc))
197
 
198
- fake_score = (weighted_fake / total_w) if total_w > 0 else 0.5
 
 
 
 
 
199
  generator = self._attribute_generator(image, fake_score)
200
 
201
  return EngineResult(
@@ -204,7 +214,7 @@ class FingerprintEngine:
204
  confidence=float(fake_score),
205
  attributed_generator=generator,
206
  explanation=(
207
- f"Ensemble fake score {fake_score:.2f} across {len(_detectors)} detectors. "
208
  f"Generator attributed to: {generator}."
209
  ),
210
  processing_time_ms=(time.perf_counter() - t0) * 1000,
@@ -212,7 +222,8 @@ class FingerprintEngine:
212
 
213
  def _attribute_generator(self, image: Image.Image, fake_score: float) -> str:
214
  if _clip_model is None or _clip_processor is None:
215
- return "unknown_gan" if fake_score > 0.5 else "real"
 
216
 
217
  try:
218
  texts = list(GENERATOR_PROMPTS.values())
@@ -225,18 +236,74 @@ class FingerprintEngine:
225
  max_length=77,
226
  )
227
  with torch.no_grad():
228
- logits = _clip_model(**inputs).logits_per_image[0]
 
 
 
 
 
229
  probs = logits.softmax(dim=0).cpu().numpy()
230
- generator = list(GENERATOR_PROMPTS.keys())[int(np.argmax(probs))]
 
 
 
 
 
 
231
 
232
  if fake_score > 0.65 and generator == "real":
233
- generator = "unknown_gan"
234
  if fake_score < 0.35 and generator != "real":
235
  generator = "real"
236
  return generator
237
  except Exception as exc:
238
  logger.warning("CLIP attribution error: %s", _short_error(exc))
239
- return "unknown_gan" if fake_score > 0.5 else "real"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
  def run_video(self, frames: list) -> EngineResult:
242
  t0 = time.perf_counter()
 
29
  ]
30
 
31
  GENERATOR_PROMPTS: dict[str, str] = {
32
+ "real": "a real photograph taken by a camera with natural lighting and film grain",
33
+ "sora": "a Sora text-to-video frame with temporal coherence and photorealistic lighting",
34
+ "runway": "a Runway Gen-2 frame with painterly dreamlike motion blur and color grading",
35
+ "wav2lip": "a Wav2Lip face-swap with sharp lip boundary artifacts and texture inconsistency at mouth edges",
36
+ "stable_diffusion": "an image generated by Stable Diffusion with painterly soft textures and dreamlike quality",
37
+ "sdxl": "an image generated by SDXL with high resolution detail, sharp edges and crisp textures",
38
+ "midjourney": "an image generated by Midjourney with cinematic dramatic lighting and extreme hyperdetail",
39
+ "dall_e": "an image generated by DALL-E with clean flat illustration style and smooth gradients",
40
+ "unknown_generative": "an AI-generated image with unidentifiable generator-specific artifacts and synthetic patterns",
41
  }
42
 
43
  FAKE_LABEL_KEYWORDS = (
 
69
  _clip_processor: Optional[CLIPProcessor] = None
70
  _loaded = False
71
 
72
+ # Thread-local storage: each request thread stores its last CLIP embedding here
73
+ # so the novelty detector can consume it without a second forward pass.
74
+ _thread_local = threading.local()
75
+
76
 
77
  def _get_pipeline():
78
  try:
 
200
  except Exception as exc:
201
  logger.warning("Detector %s inference error: %s", model_id, _short_error(exc))
202
 
203
+ ensemble_score = (weighted_fake / total_w) if total_w > 0 else 0.5
204
+
205
+ # DCT frequency band analysis (paper §III-B / Kim et al.)
206
+ dct_score = self._dct_frequency_score(image)
207
+ fake_score = float(np.clip(ensemble_score * 0.85 + dct_score * 0.15, 0.0, 1.0))
208
+
209
  generator = self._attribute_generator(image, fake_score)
210
 
211
  return EngineResult(
 
214
  confidence=float(fake_score),
215
  attributed_generator=generator,
216
  explanation=(
217
+ f"Ensemble {ensemble_score:.2f} × 0.85 + DCT {dct_score:.2f} × 0.15 = {fake_score:.2f}. "
218
  f"Generator attributed to: {generator}."
219
  ),
220
  processing_time_ms=(time.perf_counter() - t0) * 1000,
 
222
 
223
  def _attribute_generator(self, image: Image.Image, fake_score: float) -> str:
224
  if _clip_model is None or _clip_processor is None:
225
+ _thread_local.last_clip_embedding = None
226
+ return "unknown_generative" if fake_score > 0.5 else "real"
227
 
228
  try:
229
  texts = list(GENERATOR_PROMPTS.values())
 
236
  max_length=77,
237
  )
238
  with torch.no_grad():
239
+ outputs = _clip_model(**inputs)
240
+ logits = outputs.logits_per_image[0]
241
+ # Store image embedding for novelty detection
242
+ image_embeds = outputs.image_embeds.detach().cpu().numpy()[0]
243
+ _thread_local.last_clip_embedding = image_embeds
244
+
245
  probs = logits.softmax(dim=0).cpu().numpy()
246
+ max_prob = float(np.max(probs))
247
+
248
+ # Low confidence attribution → unknown generator
249
+ if max_prob < 0.25:
250
+ generator = "unknown_generative"
251
+ else:
252
+ generator = list(GENERATOR_PROMPTS.keys())[int(np.argmax(probs))]
253
 
254
  if fake_score > 0.65 and generator == "real":
255
+ generator = "unknown_generative"
256
  if fake_score < 0.35 and generator != "real":
257
  generator = "real"
258
  return generator
259
  except Exception as exc:
260
  logger.warning("CLIP attribution error: %s", _short_error(exc))
261
+ _thread_local.last_clip_embedding = None
262
+ return "unknown_generative" if fake_score > 0.5 else "real"
263
+
264
+ def _dct_frequency_score(self, image: Image.Image) -> float:
265
+ """
266
+ DCT frequency band analysis (paper §III-B).
267
+ High-frequency energy ratio is an anomaly signal: real photos follow
268
+ a predictable DCT energy roll-off; AI generators often deviate.
269
+ Returns float [0, 1] where higher = more anomalous.
270
+ """
271
+ try:
272
+ from scipy.fft import dctn # type: ignore
273
+
274
+ gray = np.array(image.convert("L"), dtype=np.float32)
275
+ h, w = gray.shape
276
+ # Align to 8×8 block boundary (JPEG-DCT standard)
277
+ bh, bw = h - h % 8, w - w % 8
278
+ if bh < 8 or bw < 8:
279
+ return 0.3
280
+ crop = gray[:bh, :bw]
281
+ # Reshape into (n_blocks_h, n_blocks_w, 8, 8) then DCT each 8×8 block
282
+ blocks = crop.reshape(bh // 8, 8, bw // 8, 8).transpose(0, 2, 1, 3)
283
+ n_bh, n_bw = blocks.shape[:2]
284
+
285
+ dc_energy_total = 0.0
286
+ all_energy_total = 0.0
287
+ for bi in range(n_bh):
288
+ for bj in range(n_bw):
289
+ dct_block = dctn(blocks[bi, bj], norm="ortho")
290
+ dc_energy_total += float(dct_block[0, 0] ** 2)
291
+ all_energy_total += float(np.sum(dct_block ** 2))
292
+
293
+ if all_energy_total < 1e-9:
294
+ return 0.3
295
+
296
+ ac_ratio = 1.0 - (dc_energy_total / all_energy_total)
297
+ # Real photos: ac_ratio ≈ 0.80–0.90; AI images can deviate significantly
298
+ score = float(np.clip(abs(ac_ratio - 0.85) / 0.15, 0.0, 1.0))
299
+ return score
300
+ except Exception as exc:
301
+ logger.warning("DCT frequency score error: %s", _short_error(exc))
302
+ return 0.3
303
+
304
def get_last_clip_embedding(self) -> Optional[np.ndarray]:
    """
    Fetch the CLIP image embedding most recently stored for the calling thread.

    The value lives in the module's thread-local storage, so each thread only
    ever sees what it stored itself. Returns None when this thread has not
    stored an embedding yet.
    """
    try:
        return _thread_local.last_clip_embedding
    except AttributeError:
        # Attribute was never set on this thread's view of the storage.
        return None
307
 
308
  def run_video(self, frames: list) -> EngineResult:
309
  t0 = time.perf_counter()
src/engines/sstgnn/engine.py CHANGED
@@ -303,6 +303,61 @@ class SSTGNNEngine:
303
  logger.warning("Geometry score error: %s", exc)
304
  return 0.3
305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  def run_video(self, frames: list[np.ndarray]) -> EngineResult:
307
  t0 = time.perf_counter()
308
  self._ensure()
@@ -319,14 +374,23 @@ class SSTGNNEngine:
319
 
320
  sample = frames[::6] or [frames[0]]
321
  results = [self.run(Image.fromarray(frame)) for frame in sample]
322
- avg = float(np.mean([r.confidence for r in results]))
 
 
 
 
 
 
323
 
324
  return EngineResult(
325
  engine="sstgnn",
326
  verdict="FAKE" if avg > 0.5 else "REAL",
327
  confidence=avg,
328
  attributed_generator=None,
329
- explanation=f"Frame-sampled SSTGNN average {avg:.2f} over {len(sample)} frames.",
 
 
 
330
  processing_time_ms=(time.perf_counter() - t0) * 1000,
331
  )
332
 
 
303
  logger.warning("Geometry score error: %s", exc)
304
  return 0.3
305
 
306
def _temporal_fft_score(self, frames: list[np.ndarray]) -> float:
    """
    Pixel-wise 1D FFT over the time axis (paper §III-C / Kim et al. [7]).

    For each pixel position in a 32×32 downsampled grid, the 1D FFT is
    computed across T frame samples.  The share of spectral power outside
    the DC bin is averaged over all pixels and compared with 0.30, the
    centre of the range this heuristic treats as normal.

    Args:
        frames: Video frames as arrays; 3-channel (or wider) frames are
            converted with COLOR_RGB2GRAY, other 3-D frames contribute
            their first channel, 2-D frames are used as-is.

    Returns:
        Float in [0, 1] where higher = more anomalous.  The neutral value
        0.3 is returned for clips shorter than 8 frames and whenever any
        step fails (the failure is logged as a warning).
    """
    try:
        import cv2  # type: ignore

        if len(frames) < 8:
            return 0.3

        # Sample at most 32 frames with a constant stride covering the whole
        # clip.  Ceil-division keeps len(frames[::step]) <= 32 without
        # truncation; a floor-divided stride followed by [:32] would ignore
        # the tail of any clip whose length is not a multiple of 32 (for
        # 33–63 frames it would keep only the first 32).
        step = max(1, -(-len(frames) // 32))
        sampled = frames[::step]

        def _to_gray32(frame: np.ndarray) -> np.ndarray:
            """Reduce one frame to a 32×32 float32 grayscale image."""
            if frame.ndim == 3 and frame.shape[2] >= 3:
                gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
            elif frame.ndim == 3:
                gray = frame[:, :, 0]
            else:
                gray = frame
            return cv2.resize(gray, (32, 32)).astype(np.float32)

        gray_stack = np.array([_to_gray32(f) for f in sampled])  # (T, 32, 32)

        # 1D real FFT along time axis
        fft_result = np.fft.rfft(gray_stack, axis=0)  # (T//2+1, 32, 32)
        power = np.abs(fft_result) ** 2  # power spectrum

        dc_power = power[0]  # (32, 32)
        total_power = np.sum(power, axis=0) + 1e-9  # guard against division by zero
        hf_ratio = 1.0 - (dc_power / total_power)  # per-pixel non-DC share
        mean_hf = float(np.mean(hf_ratio))

        # Real video: mean_hf ≈ 0.20–0.40 (most energy in slow motion).
        # Deepfakes deviate in either direction (flickering >0.55 or
        # unnaturally smooth <0.10).  Centre of normal range = 0.30.
        # NOTE(review): the DC bin includes each pixel's mean brightness
        # (the stack is not mean-subtracted), so mean_hf may sit well below
        # this range on ordinary footage — confirm the calibration.
        return float(np.clip(abs(mean_hf - 0.30) / 0.25, 0.0, 1.0))

    except Exception as exc:
        # Best-effort signal: never let a scoring failure abort the engine.
        logger.warning("Temporal FFT score error: %s", _short_error(exc))
        return 0.3
360
+
361
  def run_video(self, frames: list[np.ndarray]) -> EngineResult:
362
  t0 = time.perf_counter()
363
  self._ensure()
 
374
 
375
  sample = frames[::6] or [frames[0]]
376
  results = [self.run(Image.fromarray(frame)) for frame in sample]
377
+ cnn_geo_avg = float(np.mean([r.confidence for r in results]))
378
+
379
+ # Pixel-wise temporal FFT (paper §III-C / Kim et al. [7])
380
+ fft_score = self._temporal_fft_score(frames)
381
+
382
+ # Final: CNN+geometry 80%, temporal FFT 20%
383
+ avg = float(np.clip(cnn_geo_avg * 0.80 + fft_score * 0.20, 0.0, 1.0))
384
 
385
  return EngineResult(
386
  engine="sstgnn",
387
  verdict="FAKE" if avg > 0.5 else "REAL",
388
  confidence=avg,
389
  attributed_generator=None,
390
+ explanation=(
391
+ f"CNN+geometry avg {cnn_geo_avg:.2f} over {len(sample)} frames, "
392
+ f"temporal FFT anomaly {fft_score:.2f}."
393
+ ),
394
  processing_time_ms=(time.perf_counter() - t0) * 1000,
395
  )
396
 
src/fusion/fuser.py CHANGED
@@ -1,27 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import numpy as np
4
 
5
  from src.types import DetectionResponse, EngineResult
6
 
7
- ENGINE_WEIGHTS = {
8
- "fingerprint": 0.45,
9
- "coherence": 0.35,
10
- "sstgnn": 0.20,
 
 
11
  }
12
-
13
- ENGINE_WEIGHTS_VIDEO = {
14
- "fingerprint": 0.30,
15
- "coherence": 0.50,
16
- "sstgnn": 0.20,
17
  }
18
 
19
- ATTRIBUTION_PRIORITY = {
 
20
  "fingerprint": 1,
21
- "sstgnn": 2,
22
- "coherence": 3,
23
  }
24
 
 
 
 
25
 
26
  def _normalize_generator(value: str | None) -> str:
27
  if not value:
@@ -29,31 +49,103 @@ def _normalize_generator(value: str | None) -> str:
29
  return str(value).strip().lower().replace(" ", "_")
30
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def fuse(results: list[EngineResult], is_video: bool = False) -> tuple[str, float, str]:
33
- """Return (verdict, confidence_for_verdict, attributed_generator)."""
 
 
 
34
 
35
- weights = ENGINE_WEIGHTS_VIDEO if is_video else ENGINE_WEIGHTS
36
- active = [result for result in results if result.verdict != "UNKNOWN"]
 
 
 
37
 
38
  if not active:
39
- return "UNKNOWN", 0.5, "unknown_gan"
40
 
41
- wf = sum(
42
- result.confidence * weights.get(result.engine, 0.1)
43
- for result in active
44
- if result.verdict == "FAKE"
45
- )
46
- wr = sum(
47
- (1.0 - result.confidence) * weights.get(result.engine, 0.1)
48
- for result in active
49
- if result.verdict == "REAL"
50
- )
51
 
52
- denom = wf + wr + 1e-9
53
- fake_prob = float(np.clip(wf / denom, 0.0, 1.0))
 
 
 
 
 
54
  verdict = "FAKE" if fake_prob > 0.5 else "REAL"
55
  confidence = fake_prob if verdict == "FAKE" else (1.0 - fake_prob)
56
 
 
57
  generator = "real"
58
  if verdict == "FAKE":
59
  for result in sorted(active, key=lambda r: ATTRIBUTION_PRIORITY.get(r.engine, 9)):
@@ -62,9 +154,9 @@ def fuse(results: list[EngineResult], is_video: bool = False) -> tuple[str, floa
62
  generator = candidate
63
  break
64
  if generator == "real":
65
- generator = "unknown_gan"
66
 
67
- return verdict, confidence, generator
68
 
69
 
70
  class Fuser:
@@ -80,7 +172,7 @@ class Fuser:
80
  return DetectionResponse(
81
  verdict="REAL",
82
  confidence=0.5,
83
- attributed_generator="unknown_gan",
84
  explanation="No engine results available.",
85
  processing_time_ms=round(total_ms, 2),
86
  engine_breakdown=[],
@@ -95,7 +187,9 @@ class Fuser:
95
  f"{result.engine}:{result.verdict}({result.confidence:.2f})"
96
  for result in results
97
  )
98
- explanation = f"Fused {media_type} analysis from engines: {summary}."
 
 
99
 
100
  return DetectionResponse(
101
  verdict=verdict,
 
1
+ """
2
+ src/fusion/fuser.py — Multi-engine evidence fusion.
3
+
4
+ Implements Dempster-Shafer (DS) evidence theory combination of the three
5
+ detection engine outputs (paper §III-E / Module 5).
6
+
7
+ DS replaces the previous simple weighted average. Each engine produces a
8
+ Basic Probability Assignment (BPA) over {FAKE, REAL, Θ} where Θ is the
9
+ set of all hypotheses (total ignorance). DS combination normalises away
10
+ the conflict between contradictory masses, yielding a combined BPA that
11
+ reflects consensus while respecting uncertainty.
12
+
13
+ The final confidence is derived via the pignistic probability transform
14
+ (Smets), which distributes the ignorance mass equally between FAKE and REAL.
15
+ """
16
  from __future__ import annotations
17
 
18
  import numpy as np
19
 
20
  from src.types import DetectionResponse, EngineResult
21
 
22
# Engine reliability weights used to build each engine's BPA.
# Higher weight → engine commits more mass to its verdict, less to Θ
# (the BPA builder assigns m(Θ) = 1 - weight).
# Engines missing from the selected table fall back to 0.50 in _engine_to_bpa.
ENGINE_RELIABILITY: dict[str, float] = {
    "fingerprint": 0.70,
    "coherence": 0.65,
    "sstgnn": 0.60,
}
# Same table, used instead of ENGINE_RELIABILITY when is_video is true.
ENGINE_RELIABILITY_VIDEO: dict[str, float] = {
    "fingerprint": 0.55,
    "coherence": 0.75,
    "sstgnn": 0.65,
}

# Attribution priority: which engine's generator label is most trusted.
# Lower number = consulted first; fuse() ranks unlisted engines as 9.
ATTRIBUTION_PRIORITY: dict[str, int] = {
    "fingerprint": 1,
    "sstgnn": 2,
    "coherence": 3,
}

# Type alias for a Basic Probability Assignment over {FAKE, REAL, Θ}:
# a dict with the keys "FAKE", "REAL" and "Θ".
_BPA = dict[str, float]
44
+
45
 
46
  def _normalize_generator(value: str | None) -> str:
47
  if not value:
 
49
  return str(value).strip().lower().replace(" ", "_")
50
 
51
 
52
def _engine_to_bpa(result: EngineResult, is_video: bool = False) -> _BPA:
    """
    Convert an EngineResult into a Basic Probability Assignment.

    The engine reliability weight (w) determines how much mass is committed
    to the engine's opinion vs. left as ignorance (Θ).

    BPA structure:
        m({FAKE}) + m({REAL}) + m(Θ) = 1.0

    Args:
        result: Engine output; ``engine``, ``verdict`` and ``confidence``
            are read.
        is_video: Select the video reliability table instead of the image one.

    Returns:
        Dict with keys "FAKE", "REAL" and "Θ".  An UNKNOWN verdict yields the
        vacuous assignment (all mass on Θ).
    """
    if result.verdict == "UNKNOWN":
        return {"FAKE": 0.0, "REAL": 0.0, "Θ": 1.0}

    weights = ENGINE_RELIABILITY_VIDEO if is_video else ENGINE_RELIABILITY
    w = weights.get(result.engine, 0.50)
    c = float(result.confidence)

    if result.verdict == "FAKE":
        fake_share, real_share = c, 1.0 - c
    else:
        # verdict == "REAL"
        # Engines do not agree on what `confidence` means for a REAL verdict:
        # the SSTGNN video path reports its fake score (REAL whenever that
        # score is <= 0.5), while a confidence-in-verdict engine reports
        # >= 0.5.  Taking the larger of c and 1 - c as the REAL share keeps
        # the evidence pointing at the verdict the engine actually returned;
        # using c directly would turn a strong "REAL, fake score 0.1" result
        # into evidence for FAKE.
        real_share = max(c, 1.0 - c)
        fake_share = 1.0 - real_share

    return {
        "FAKE": fake_share * w,
        "REAL": real_share * w,
        "Θ": 1.0 - w,
    }
80
+
81
+
82
+ def _ds_combine(m1: _BPA, m2: _BPA) -> _BPA:
83
+ """
84
+ Dempster's combination rule for two BPAs over {FAKE, REAL, Θ}.
85
+
86
+ K = conflict = Σ_{A∩B=∅} m1(A)·m2(B)
87
+ m12(C) = Σ_{A∩B=C} m1(A)·m2(B) / (1 - K) for C ≠ ∅
88
+ """
89
+ # Conflict mass: FAKE ∩ REAL = ∅, so conflict = FAKE×REAL + REAL×FAKE
90
+ K = m1["FAKE"] * m2["REAL"] + m1["REAL"] * m2["FAKE"]
91
+
92
+ # Unnormalised joint masses
93
+ raw_fake = (
94
+ m1["FAKE"] * m2["FAKE"] # FAKE ∩ FAKE = FAKE
95
+ + m1["FAKE"] * m2["Θ"] # FAKE ∩ Θ = FAKE
96
+ + m1["Θ"] * m2["FAKE"] # Θ ∩ FAKE = FAKE
97
+ )
98
+ raw_real = (
99
+ m1["REAL"] * m2["REAL"]
100
+ + m1["REAL"] * m2["Θ"]
101
+ + m1["Θ"] * m2["REAL"]
102
+ )
103
+ raw_theta = m1["Θ"] * m2["Θ"] # Θ ∩ Θ = Θ
104
+
105
+ norm = 1.0 - K
106
+ if norm < 1e-9:
107
+ # Total conflict → maximum uncertainty
108
+ return {"FAKE": 0.5, "REAL": 0.5, "Θ": 0.0}
109
+
110
+ return {
111
+ "FAKE": raw_fake / norm,
112
+ "REAL": raw_real / norm,
113
+ "Θ": raw_theta / norm,
114
+ }
115
+
116
+
117
  def fuse(results: list[EngineResult], is_video: bool = False) -> tuple[str, float, str]:
118
+ """
119
+ Dempster-Shafer fusion of engine results.
120
+
121
+ Returns (verdict, confidence_for_verdict, attributed_generator).
122
 
123
+ Confidence is derived via the pignistic probability transform (Smets 1990):
124
+ ignorance mass Θ is split equally between FAKE and REAL before thresholding.
125
+ This avoids overconfident verdicts when engines disagree.
126
+ """
127
+ active = [r for r in results if r.verdict != "UNKNOWN"]
128
 
129
  if not active:
130
+ return "UNKNOWN", 0.5, "unknown_generative"
131
 
132
+ # Build and combine BPAs iteratively
133
+ bpas = [_engine_to_bpa(r, is_video) for r in active]
134
+ combined = bpas[0]
135
+ for bpa in bpas[1:]:
136
+ combined = _ds_combine(combined, bpa)
 
 
 
 
 
137
 
138
+ # Pignistic transform: distribute Θ mass equally
139
+ theta = combined.get("Θ", 0.0)
140
+ pign_fake = combined["FAKE"] + theta / 2.0
141
+ pign_real = combined["REAL"] + theta / 2.0
142
+ pign_total = pign_fake + pign_real + 1e-9
143
+
144
+ fake_prob = float(np.clip(pign_fake / pign_total, 0.0, 1.0))
145
  verdict = "FAKE" if fake_prob > 0.5 else "REAL"
146
  confidence = fake_prob if verdict == "FAKE" else (1.0 - fake_prob)
147
 
148
+ # Generator attribution: highest-priority engine with a non-real label
149
  generator = "real"
150
  if verdict == "FAKE":
151
  for result in sorted(active, key=lambda r: ATTRIBUTION_PRIORITY.get(r.engine, 9)):
 
154
  generator = candidate
155
  break
156
  if generator == "real":
157
+ generator = "unknown_generative"
158
 
159
+ return verdict, float(np.clip(confidence, 0.0, 1.0)), generator
160
 
161
 
162
  class Fuser:
 
172
  return DetectionResponse(
173
  verdict="REAL",
174
  confidence=0.5,
175
+ attributed_generator="unknown_generative",
176
  explanation="No engine results available.",
177
  processing_time_ms=round(total_ms, 2),
178
  engine_breakdown=[],
 
187
  f"{result.engine}:{result.verdict}({result.confidence:.2f})"
188
  for result in results
189
  )
190
+ explanation = (
191
+ f"Dempster-Shafer fusion ({media_type}) from engines: {summary}."
192
+ )
193
 
194
  return DetectionResponse(
195
  verdict=verdict,
src/types.py CHANGED
@@ -12,27 +12,29 @@ from pydantic import BaseModel, field_validator
12
 
13
 
14
  class GeneratorLabel(str, Enum):
15
- """Generator attribution labels used across the pipeline."""
16
 
17
  real = "real"
18
- unknown_gan = "unknown_gan"
 
 
19
  stable_diffusion = "stable_diffusion"
 
20
  midjourney = "midjourney"
21
  dall_e = "dall_e"
22
- flux = "flux"
23
- firefly = "firefly"
24
- imagen = "imagen"
25
 
26
 
27
  GENERATOR_INDEX_TO_LABEL: dict[int, GeneratorLabel] = {
28
  0: GeneratorLabel.real,
29
- 1: GeneratorLabel.unknown_gan,
30
- 2: GeneratorLabel.stable_diffusion,
31
- 3: GeneratorLabel.midjourney,
32
- 4: GeneratorLabel.dall_e,
33
- 5: GeneratorLabel.flux,
34
- 6: GeneratorLabel.firefly,
35
- 7: GeneratorLabel.imagen,
 
36
  }
37
 
38
 
@@ -46,6 +48,10 @@ class EngineResult(BaseModel):
46
  explanation: str = ""
47
  processing_time_ms: float = 0.0
48
 
 
 
 
 
49
  @field_validator("confidence")
50
  @classmethod
51
  def confidence_in_range(cls, value: float) -> float:
@@ -71,6 +77,13 @@ class DetectionResponse(BaseModel):
71
  processing_time_ms: float
72
  engine_breakdown: list[EngineResult]
73
 
 
 
 
 
 
 
 
74
  # Optional explainability metadata
75
  clarity_score: Optional[float] = None
76
  saliency_map_url: Optional[str] = None
 
12
 
13
 
14
class GeneratorLabel(str, Enum):
    """Generator attribution labels aligned to paper's 8-generator taxonomy.

    The ``str`` mixin makes every member compare equal to its plain string
    value.  ``real`` marks authentic media; the other eight members name a
    generator, with ``unknown_generative`` as the catch-all.
    """

    real = "real"
    sora = "sora"
    runway = "runway"
    wav2lip = "wav2lip"
    stable_diffusion = "stable_diffusion"
    sdxl = "sdxl"
    midjourney = "midjourney"
    dall_e = "dall_e"
    # Fallback label for content judged synthetic without a specific generator.
    unknown_generative = "unknown_generative"
 
 
26
 
27
 
28
# Integer index → generator label.  A label's position in the tuple below is
# its index (0 = real … 8 = unknown_generative), so the order must stay fixed.
GENERATOR_INDEX_TO_LABEL: dict[int, GeneratorLabel] = dict(
    enumerate(
        (
            GeneratorLabel.real,
            GeneratorLabel.sora,
            GeneratorLabel.runway,
            GeneratorLabel.wav2lip,
            GeneratorLabel.stable_diffusion,
            GeneratorLabel.sdxl,
            GeneratorLabel.midjourney,
            GeneratorLabel.dall_e,
            GeneratorLabel.unknown_generative,
        )
    )
)
39
 
40
 
 
48
  explanation: str = ""
49
  processing_time_ms: float = 0.0
50
 
51
+ # Audio coherence sub-scores (populated by CoherenceEngine on video input)
52
+ audio_sync_score: Optional[float] = None
53
+ timestamp_markers: list[dict] = []
54
+
55
  @field_validator("confidence")
56
  @classmethod
57
  def confidence_in_range(cls, value: float) -> float:
 
77
  processing_time_ms: float
78
  engine_breakdown: list[EngineResult]
79
 
80
+ # Module 4 — Continual Learning novelty signal
81
+ novelty_score: Optional[float] = None
82
+
83
+ # Module 1 — Audio lip-sync coherence sub-scores
84
+ audio_sync_score: Optional[float] = None
85
+ timestamp_markers: list[dict] = []
86
+
87
  # Optional explainability metadata
88
  clarity_score: Optional[float] = None
89
  saliency_map_url: Optional[str] = None