akagtag committed on
Commit
1e24aab
·
1 Parent(s): c909ee6

Update model stack, add audio stream extraction, and harden inference wiring

Browse files
requirements.txt CHANGED
@@ -18,6 +18,7 @@ torchvision>=0.16.0
18
  facenet-pytorch>=2.5.3; python_version < "3.13"
19
  mediapipe>=0.10.14
20
  opencv-python-headless>=4.9.0
 
21
 
22
  # ML - sstgnn
23
  torch-geometric>=2.5.0
 
18
  facenet-pytorch>=2.5.3; python_version < "3.13"
19
  mediapipe>=0.10.14
20
  opencv-python-headless>=4.9.0
21
+ librosa>=0.10.2
22
 
23
  # ML - sstgnn
24
  torch-geometric>=2.5.0
runpod_handler.py CHANGED
@@ -16,34 +16,13 @@ from src.engines.fingerprint.engine import FingerprintEngine
16
  from src.engines.sstgnn.engine import SSTGNNEngine
17
  from src.explainability.explainer import explain
18
  from src.fusion.fuser import fuse
 
19
 
20
  _fp = FingerprintEngine()
21
  _co = CoherenceEngine()
22
  _st = SSTGNNEngine()
23
 
24
 
25
- def _extract_frames(video_path: str) -> list:
26
- try:
27
- import cv2
28
- except Exception:
29
- return []
30
-
31
- cap = cv2.VideoCapture(video_path)
32
- frames = []
33
- index = 0
34
- while True:
35
- ret, frame = cap.read()
36
- if not ret:
37
- break
38
- if index % 4 == 0:
39
- frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
40
- index += 1
41
- if len(frames) >= 300:
42
- break
43
- cap.release()
44
- return frames
45
-
46
-
47
  def handler(job: dict) -> dict:
48
  inp = job.get("input", {})
49
  encoded = inp.get("data") or inp.get("image_b64")
@@ -67,12 +46,18 @@ def handler(job: dict) -> dict:
67
  tmp_path = temp.name
68
 
69
  try:
70
- frames = _extract_frames(tmp_path)
 
71
  finally:
72
  os.unlink(tmp_path)
73
 
 
 
 
 
 
74
  fp = _fp.run_video(frames)
75
- co = _co.run_video(frames)
76
  st = _st.run_video(frames)
77
  verdict, conf, generator = fuse([fp, co, st], is_video=True)
78
 
 
16
  from src.engines.sstgnn.engine import SSTGNNEngine
17
  from src.explainability.explainer import explain
18
  from src.fusion.fuser import fuse
19
+ from src.services.media_utils import extract_audio_waveform, extract_video_frames
20
 
21
  _fp = FingerprintEngine()
22
  _co = CoherenceEngine()
23
  _st = SSTGNNEngine()
24
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def handler(job: dict) -> dict:
27
  inp = job.get("input", {})
28
  encoded = inp.get("data") or inp.get("image_b64")
 
46
  tmp_path = temp.name
47
 
48
  try:
49
+ frames = extract_video_frames(tmp_path, max_frames=300)
50
+ audio = extract_audio_waveform(tmp_path, sample_rate=16000)
51
  finally:
52
  os.unlink(tmp_path)
53
 
54
+ audio_waveform = None
55
+ audio_sample_rate = 16000
56
+ if audio is not None:
57
+ audio_waveform, audio_sample_rate = audio
58
+
59
  fp = _fp.run_video(frames)
60
+ co = _co.run_video(frames, audio_waveform, audio_sample_rate)
61
  st = _st.run_video(frames)
62
  verdict, conf, generator = fuse([fp, co, st], is_video=True)
63
 
src/api/main.py CHANGED
@@ -19,7 +19,7 @@ from PIL import Image
19
  from src.engines.coherence.engine import CoherenceEngine
20
  from src.engines.fingerprint.engine import FingerprintEngine
21
  from src.engines.sstgnn.engine import SSTGNNEngine
22
- from src.explainability.explainer import explain
23
  from src.fusion.fuser import fuse
24
  from src.services.hf_inference_client import HFInferenceClient, HFInferenceUnavailable
25
  from src.services.inference_router import (
@@ -27,6 +27,7 @@ from src.services.inference_router import (
27
  is_runpod_configured,
28
  route_inference,
29
  )
 
30
  from src.types import DetectionResponse, EngineResult
31
 
32
  logger = logging.getLogger(__name__)
@@ -83,20 +84,32 @@ SUPPORTED_GENERATORS = [
83
  def _model_inventory() -> dict[str, object]:
84
  return {
85
  "fingerprint": {
86
- "primary_detector": "Organika/sdxl-detector",
87
- "backup_detector": "haywoodsloan/ai-image-detector-deploy",
 
 
 
 
 
 
 
88
  "attribution_model": "openai/clip-vit-large-patch14",
89
  },
90
  "coherence": {
91
- "hf_fallback_model": os.environ.get("COHERENCE_HF_MODEL_ID", "Wvolf/ViT_Deepfake_Detection"),
 
 
 
92
  "facial_landmarks": "mediapipe FaceMesh/FaceLandmarker",
93
  "temporal_embedding": "facenet-pytorch InceptionResnetV1(vggface2) when available",
94
  },
95
  "sstgnn": {
96
- "primary_detector": "dima806/deepfake_vs_real_image_detection",
97
- "backup_detector": "prithivMLmods/Deep-Fake-Detector-Model",
98
  "graph_component": "scipy.spatial.Delaunay + MediaPipe landmarks",
99
  },
 
 
 
100
  "generator_labels": SUPPORTED_GENERATORS,
101
  }
102
 
@@ -137,32 +150,6 @@ async def health_models() -> dict[str, object]:
137
  return _model_inventory()
138
 
139
 
140
- def _extract_frames(path: str) -> list[np.ndarray]:
141
- try:
142
- import cv2
143
- except Exception as exc:
144
- raise RuntimeError(f"OpenCV unavailable: {exc}") from exc
145
-
146
- cap = cv2.VideoCapture(path)
147
- total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
148
- step = max(1, total // MAX_FRAMES) if total > 0 else 1
149
-
150
- frames: list[np.ndarray] = []
151
- index = 0
152
- while True:
153
- ret, frame = cap.read()
154
- if not ret:
155
- break
156
- if index % step == 0:
157
- frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
158
- index += 1
159
- if len(frames) >= MAX_FRAMES:
160
- break
161
-
162
- cap.release()
163
- return frames
164
-
165
-
166
  def _assign_processing_time(results: list[EngineResult], ms: float) -> list[EngineResult]:
167
  for result in results:
168
  result.processing_time_ms = round(ms, 2)
@@ -256,7 +243,7 @@ async def _hf_detect_video(data: bytes) -> DetectionResponse:
256
  tmp_path = tmp.name
257
 
258
  try:
259
- frames = await asyncio.to_thread(_extract_frames, tmp_path)
260
  finally:
261
  Path(tmp_path).unlink(missing_ok=True)
262
 
@@ -404,7 +391,9 @@ async def detect_video(file: UploadFile = File(...)) -> DetectionResponse:
404
  tmp_path = tmp.name
405
 
406
  try:
407
- frames = await asyncio.to_thread(_extract_frames, tmp_path)
 
 
408
  finally:
409
  Path(tmp_path).unlink(missing_ok=True)
410
 
@@ -412,10 +401,14 @@ async def detect_video(file: UploadFile = File(...)) -> DetectionResponse:
412
  raise HTTPException(status_code=422, detail="Could not extract frames")
413
 
414
  await _ensure_models_loaded()
 
 
 
 
415
 
416
  fp, co, st = await asyncio.gather(
417
  asyncio.to_thread(_fp.run_video, frames),
418
- asyncio.to_thread(_co.run_video, frames),
419
  asyncio.to_thread(_st.run_video, frames),
420
  )
421
 
 
19
  from src.engines.coherence.engine import CoherenceEngine
20
  from src.engines.fingerprint.engine import FingerprintEngine
21
  from src.engines.sstgnn.engine import SSTGNNEngine
22
+ from src.explainability.explainer import MODEL_CANDIDATES, explain
23
  from src.fusion.fuser import fuse
24
  from src.services.hf_inference_client import HFInferenceClient, HFInferenceUnavailable
25
  from src.services.inference_router import (
 
27
  is_runpod_configured,
28
  route_inference,
29
  )
30
+ from src.services.media_utils import extract_audio_waveform, extract_video_frames
31
  from src.types import DetectionResponse, EngineResult
32
 
33
  logger = logging.getLogger(__name__)
 
84
  def _model_inventory() -> dict[str, object]:
85
  return {
86
  "fingerprint": {
87
+ "ensemble_detectors": [
88
+ "yermandy/deepfake-detection",
89
+ "yermandy/GenD_CLIP_L_14",
90
+ "yermandy/GenD_DINOv3_L",
91
+ "Wvolf/ViT_Deepfake_Detection",
92
+ "prithivMLmods/Deep-Fake-Detector-v2-Model",
93
+ "Smogy/SMOGY-Ai-images-detector",
94
+ ],
95
+ "ensemble_weights": [1.4, 1.4, 1.1, 1.0, 1.0, 0.9],
96
  "attribution_model": "openai/clip-vit-large-patch14",
97
  },
98
  "coherence": {
99
+ "audio_deepfake_model": os.environ.get(
100
+ "COHERENCE_AUDIO_MODEL_ID",
101
+ "nii-yamagishilab/wav2vec-large-anti-deepfake-nda",
102
+ ),
103
  "facial_landmarks": "mediapipe FaceMesh/FaceLandmarker",
104
  "temporal_embedding": "facenet-pytorch InceptionResnetV1(vggface2) when available",
105
  },
106
  "sstgnn": {
107
+ "pretrained_hf_models": [],
 
108
  "graph_component": "scipy.spatial.Delaunay + MediaPipe landmarks",
109
  },
110
+ "explainability": {
111
+ "gemini_model_candidates": list(MODEL_CANDIDATES),
112
+ },
113
  "generator_labels": SUPPORTED_GENERATORS,
114
  }
115
 
 
150
  return _model_inventory()
151
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def _assign_processing_time(results: list[EngineResult], ms: float) -> list[EngineResult]:
154
  for result in results:
155
  result.processing_time_ms = round(ms, 2)
 
243
  tmp_path = tmp.name
244
 
245
  try:
246
+ frames = await asyncio.to_thread(extract_video_frames, tmp_path, MAX_FRAMES)
247
  finally:
248
  Path(tmp_path).unlink(missing_ok=True)
249
 
 
391
  tmp_path = tmp.name
392
 
393
  try:
394
+ frames_task = asyncio.to_thread(extract_video_frames, tmp_path, MAX_FRAMES)
395
+ audio_task = asyncio.to_thread(extract_audio_waveform, tmp_path, 16000)
396
+ frames, audio = await asyncio.gather(frames_task, audio_task)
397
  finally:
398
  Path(tmp_path).unlink(missing_ok=True)
399
 
 
401
  raise HTTPException(status_code=422, detail="Could not extract frames")
402
 
403
  await _ensure_models_loaded()
404
+ audio_waveform = None
405
+ audio_sample_rate = 16000
406
+ if audio is not None:
407
+ audio_waveform, audio_sample_rate = audio
408
 
409
  fp, co, st = await asyncio.gather(
410
  asyncio.to_thread(_fp.run_video, frames),
411
+ asyncio.to_thread(_co.run_video, frames, audio_waveform, audio_sample_rate),
412
  asyncio.to_thread(_st.run_video, frames),
413
  )
414
 
src/engines/coherence/detector.py CHANGED
@@ -6,8 +6,10 @@ from __future__ import annotations
6
 
7
  import os
8
  import tempfile
 
9
 
10
  from src.types import EngineResult
 
11
 
12
  from .engine import CoherenceEngine
13
 
@@ -16,39 +18,26 @@ class CoherenceDetector(CoherenceEngine):
16
  threshold = 0.5
17
 
18
  def detect_bytes(self, video_bytes: bytes) -> EngineResult:
19
- frames = self._extract_video_frames(video_bytes)
20
  if not frames:
21
  return self._error_result(0.0)
22
  try:
23
- return self.run_video(frames)
24
  except Exception:
25
  return self._error_result(0.0)
26
 
27
- def _extract_video_frames(self, video_bytes: bytes) -> list:
28
- try:
29
- import cv2
30
- except Exception:
31
- return []
32
-
33
  with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
34
  tmp.write(video_bytes)
35
  tmp_path = tmp.name
36
 
37
- frames = []
38
  try:
39
- cap = cv2.VideoCapture(tmp_path)
40
- index = 0
41
- while True:
42
- ok, frame = cap.read()
43
- if not ok:
44
- break
45
- if index % 2 == 0:
46
- frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
47
- index += 1
48
- if len(frames) >= 64:
49
- break
50
- cap.release()
51
- return frames
52
  finally:
53
  os.unlink(tmp_path)
54
 
 
6
 
7
  import os
8
  import tempfile
9
+ import numpy as np
10
 
11
  from src.types import EngineResult
12
+ from src.services.media_utils import extract_audio_waveform, extract_video_frames
13
 
14
  from .engine import CoherenceEngine
15
 
 
18
  threshold = 0.5
19
 
20
  def detect_bytes(self, video_bytes: bytes) -> EngineResult:
21
+ frames, audio_waveform, audio_sample_rate = self._extract_video_media(video_bytes)
22
  if not frames:
23
  return self._error_result(0.0)
24
  try:
25
+ return self.run_video(frames, audio_waveform, audio_sample_rate)
26
  except Exception:
27
  return self._error_result(0.0)
28
 
29
+ def _extract_video_media(self, video_bytes: bytes) -> tuple[list[np.ndarray], np.ndarray | None, int]:
 
 
 
 
 
30
  with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
31
  tmp.write(video_bytes)
32
  tmp_path = tmp.name
33
 
 
34
  try:
35
+ frames = extract_video_frames(tmp_path, max_frames=64)
36
+ audio = extract_audio_waveform(tmp_path, sample_rate=16000)
37
+ if audio is None:
38
+ return frames, None, 16000
39
+ waveform, sample_rate = audio
40
+ return frames, waveform, sample_rate
 
 
 
 
 
 
 
41
  finally:
42
  os.unlink(tmp_path)
43
 
src/engines/coherence/engine.py CHANGED
@@ -21,7 +21,7 @@ _mtcnn = None
21
  _resnet = None
22
  _face_mesh = None
23
  _torch = None
24
- _hf_detector = None
25
 
26
 
27
  def _skip_model_loads() -> bool:
@@ -88,8 +88,8 @@ def _build_face_mesh():
88
  static_image_mode=False,
89
  max_num_faces=1,
90
  refine_landmarks=True,
91
- min_detection_confidence=0.5,
92
- )
93
 
94
  from mediapipe.tasks import python as mp_tasks_python # type: ignore
95
  from mediapipe.tasks.python import vision # type: ignore
@@ -104,22 +104,30 @@ def _build_face_mesh():
104
  return _TasksFaceMeshAdapter(mp, landmarker)
105
 
106
 
107
- def _build_image_classifier(model_id: str) -> Any:
108
  pipeline = _get_pipeline()
109
 
110
  cache_dir = os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
111
  try:
112
  return pipeline(
113
- "image-classification",
114
  model=model_id,
115
- model_kwargs={"cache_dir": cache_dir},
 
116
  )
117
  except Exception:
118
- return pipeline("image-classification", model=model_id)
 
 
 
 
 
 
 
119
 
120
 
121
  def _load() -> None:
122
- global _mtcnn, _resnet, _face_mesh, _load_attempted, _torch, _hf_detector
123
  if _load_attempted:
124
  return
125
 
@@ -152,10 +160,13 @@ def _load() -> None:
152
  logger.warning("Coherence embedding model load failed, using heuristic-only mode: %s", exc)
153
 
154
  try:
155
- model_id = os.environ.get("COHERENCE_HF_MODEL_ID", "Wvolf/ViT_Deepfake_Detection")
156
- _hf_detector = _build_image_classifier(model_id)
 
 
 
157
  except Exception as exc:
158
- logger.warning("Coherence HF fallback model unavailable: %s", exc)
159
 
160
  logger.info("Coherence model load attempt complete")
161
 
@@ -171,7 +182,6 @@ class CoherenceEngine:
171
 
172
  frame = np.array(image.convert("RGB"))
173
  score = self._image_score(frame)
174
- score = float(np.clip(score * 0.6 + self._hf_image_score(image) * 0.4, 0.0, 1.0))
175
 
176
  return EngineResult(
177
  engine="coherence",
@@ -214,7 +224,12 @@ class CoherenceEngine:
214
  logger.warning("Coherence image scoring failed: %s", exc)
215
  return 0.35
216
 
217
- def run_video(self, frames: list[np.ndarray]) -> EngineResult:
 
 
 
 
 
218
  t0 = time.perf_counter()
219
  self._ensure()
220
 
@@ -236,9 +251,8 @@ class CoherenceEngine:
236
  delta = self._embedding_variance(frames)
237
  jerk = self._landmark_jerk(frames)
238
  blink = self._blink_anomaly(frames)
239
-
240
- hf_video = self._hf_video_score(frames)
241
- score = float(np.clip(delta * 0.35 + jerk * 0.30 + blink * 0.15 + hf_video * 0.20, 0.0, 1.0))
242
 
243
  return EngineResult(
244
  engine="coherence",
@@ -249,44 +263,45 @@ class CoherenceEngine:
249
  f"Embedding variance {delta:.2f}, "
250
  f"landmark jerk {jerk:.2f}, "
251
  f"blink anomaly {blink:.2f}, "
252
- f"hf score {hf_video:.2f}."
253
  ),
254
  processing_time_ms=(time.perf_counter() - t0) * 1000,
255
  )
256
 
257
- def _hf_image_score(self, image: Image.Image) -> float:
258
- if _hf_detector is None:
259
  return 0.5
260
- try:
261
- preds = _hf_detector(image)
262
- return self._fake_score_from_preds(preds)
263
- except Exception:
264
  return 0.5
265
 
266
- def _hf_video_score(self, frames: list[np.ndarray]) -> float:
267
- if _hf_detector is None or not frames:
268
- return 0.5
269
- values: list[float] = []
270
- for frame in frames[::8]:
271
- try:
272
- preds = _hf_detector(Image.fromarray(frame))
273
- values.append(self._fake_score_from_preds(preds))
274
- except Exception:
275
- continue
276
- if not values:
277
  return 0.5
278
- return float(np.clip(np.mean(values), 0.0, 1.0))
279
 
280
- def _fake_score_from_preds(self, preds: list[dict]) -> float:
 
 
 
281
  if not preds:
282
  return 0.5
283
- keywords = ("fake", "deepfake", "generated", "synthetic", "ai", "artificial")
 
284
  best = 0.0
285
  for pred in preds:
286
  label = str(pred.get("label", "")).lower()
287
  score = float(pred.get("score", 0.0))
288
- if any(keyword in label for keyword in keywords):
289
  best = max(best, score)
 
290
  if best == 0.0:
291
  return 0.5
292
  return float(np.clip(best, 0.0, 1.0))
 
21
  _resnet = None
22
  _face_mesh = None
23
  _torch = None
24
+ _audio_detector = None
25
 
26
 
27
  def _skip_model_loads() -> bool:
 
88
  static_image_mode=False,
89
  max_num_faces=1,
90
  refine_landmarks=True,
91
+ min_detection_confidence=0.5,
92
+ )
93
 
94
  from mediapipe.tasks import python as mp_tasks_python # type: ignore
95
  from mediapipe.tasks.python import vision # type: ignore
 
104
  return _TasksFaceMeshAdapter(mp, landmarker)
105
 
106
 
107
+ def _build_audio_classifier(model_id: str) -> Any:
108
  pipeline = _get_pipeline()
109
 
110
  cache_dir = os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
111
  try:
112
  return pipeline(
113
+ "audio-classification",
114
  model=model_id,
115
+ trust_remote_code=True,
116
+ model_kwargs={"cache_dir": cache_dir, "trust_remote_code": True},
117
  )
118
  except Exception:
119
+ try:
120
+ return pipeline(
121
+ "audio-classification",
122
+ model=model_id,
123
+ model_kwargs={"cache_dir": cache_dir},
124
+ )
125
+ except Exception:
126
+ return pipeline("audio-classification", model=model_id)
127
 
128
 
129
  def _load() -> None:
130
+ global _mtcnn, _resnet, _face_mesh, _load_attempted, _torch, _audio_detector
131
  if _load_attempted:
132
  return
133
 
 
160
  logger.warning("Coherence embedding model load failed, using heuristic-only mode: %s", exc)
161
 
162
  try:
163
+ model_id = os.environ.get(
164
+ "COHERENCE_AUDIO_MODEL_ID",
165
+ "nii-yamagishilab/wav2vec-large-anti-deepfake-nda",
166
+ )
167
+ _audio_detector = _build_audio_classifier(model_id)
168
  except Exception as exc:
169
+ logger.warning("Coherence audio model unavailable: %s", exc)
170
 
171
  logger.info("Coherence model load attempt complete")
172
 
 
182
 
183
  frame = np.array(image.convert("RGB"))
184
  score = self._image_score(frame)
 
185
 
186
  return EngineResult(
187
  engine="coherence",
 
224
  logger.warning("Coherence image scoring failed: %s", exc)
225
  return 0.35
226
 
227
+ def run_video(
228
+ self,
229
+ frames: list[np.ndarray],
230
+ audio_waveform: np.ndarray | None = None,
231
+ audio_sample_rate: int = 16000,
232
+ ) -> EngineResult:
233
  t0 = time.perf_counter()
234
  self._ensure()
235
 
 
251
  delta = self._embedding_variance(frames)
252
  jerk = self._landmark_jerk(frames)
253
  blink = self._blink_anomaly(frames)
254
+ audio = self._audio_deepfake_score(audio_waveform, audio_sample_rate)
255
+ score = float(np.clip(delta * 0.35 + jerk * 0.30 + blink * 0.15 + audio * 0.20, 0.0, 1.0))
 
256
 
257
  return EngineResult(
258
  engine="coherence",
 
263
  f"Embedding variance {delta:.2f}, "
264
  f"landmark jerk {jerk:.2f}, "
265
  f"blink anomaly {blink:.2f}, "
266
+ f"audio deepfake score {audio:.2f}."
267
  ),
268
  processing_time_ms=(time.perf_counter() - t0) * 1000,
269
  )
270
 
271
+ def _audio_deepfake_score(self, waveform: np.ndarray | None = None, sample_rate: int = 16000) -> float:
272
+ if _audio_detector is None:
273
  return 0.5
274
+ if waveform is None or waveform.size == 0:
 
 
 
275
  return 0.5
276
 
277
+ max_seconds = int(os.environ.get("COHERENCE_AUDIO_MAX_SECONDS", "30"))
278
+ max_samples = max(16000, sample_rate * max_seconds)
279
+ if waveform.size > max_samples:
280
+ waveform = waveform[:max_samples]
281
+
282
+ try:
283
+ preds = _audio_detector(
284
+ {"array": waveform.astype(np.float32), "sampling_rate": sample_rate},
285
+ top_k=5,
286
+ )
287
+ except Exception:
288
  return 0.5
 
289
 
290
+ if isinstance(preds, dict):
291
+ preds = [preds]
292
+ if preds and isinstance(preds[0], list):
293
+ preds = preds[0]
294
  if not preds:
295
  return 0.5
296
+
297
+ fake_keywords = ("spoof", "fake", "deepfake", "synthetic", "generated")
298
  best = 0.0
299
  for pred in preds:
300
  label = str(pred.get("label", "")).lower()
301
  score = float(pred.get("score", 0.0))
302
+ if any(keyword in label for keyword in fake_keywords):
303
  best = max(best, score)
304
+
305
  if best == 0.0:
306
  return 0.5
307
  return float(np.clip(best, 0.0, 1.0))
src/engines/fingerprint/engine.py CHANGED
@@ -31,9 +31,9 @@ _FAKE_KEYWORDS = ("artificial", "fake", "ai", "generated", "deepfake", "syntheti
31
 
32
  _lock = threading.Lock()
33
  _load_attempted = False
34
- _detector = None
 
35
  _clip_zeroshot = None
36
- _backup = None
37
 
38
 
39
  def _skip_model_loads() -> bool:
@@ -61,10 +61,18 @@ def _build_image_classifier(model_id: str) -> Any:
61
  return pipeline(
62
  "image-classification",
63
  model=model_id,
64
- model_kwargs={"cache_dir": CACHE},
 
65
  )
66
  except Exception:
67
- return pipeline("image-classification", model=model_id)
 
 
 
 
 
 
 
68
 
69
 
70
  def _build_zero_shot_image_classifier(model_id: str) -> Any:
@@ -74,14 +82,22 @@ def _build_zero_shot_image_classifier(model_id: str) -> Any:
74
  return pipeline(
75
  "zero-shot-image-classification",
76
  model=model_id,
77
- model_kwargs={"cache_dir": CACHE},
 
78
  )
79
  except Exception:
80
- return pipeline("zero-shot-image-classification", model=model_id)
 
 
 
 
 
 
 
81
 
82
 
83
  def _load() -> None:
84
- global _detector, _clip_zeroshot, _backup, _load_attempted
85
  if _load_attempted:
86
  return
87
 
@@ -93,13 +109,42 @@ def _load() -> None:
93
  logger.info("Loading fingerprint models...")
94
 
95
  try:
96
- _detector = _build_image_classifier("Organika/sdxl-detector")
97
- _clip_zeroshot = _build_zero_shot_image_classifier("openai/clip-vit-large-patch14")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
- try:
100
- _backup = _build_image_classifier("haywoodsloan/ai-image-detector-deploy")
101
- except Exception:
102
- logger.warning("Backup fingerprint detector unavailable")
 
 
 
 
 
 
 
 
103
 
104
  except Exception as exc:
105
  logger.warning("Fingerprint models unavailable: %s", exc)
@@ -136,18 +181,18 @@ class FingerprintEngine:
136
  image = image.convert("RGB")
137
 
138
  fake_score = 0.5
139
- try:
140
- if _detector is not None:
141
- fake_score = _fake_score(_detector(image))
142
- except Exception as exc:
143
- logger.warning("Primary detector error: %s", exc)
144
-
145
- if _backup is not None:
146
  try:
147
- backup_score = _fake_score(_backup(image))
148
- fake_score = float(np.clip(fake_score * 0.6 + backup_score * 0.4, 0.0, 1.0))
149
- except Exception:
150
- pass
 
 
 
 
151
 
152
  generator = "real"
153
  try:
 
31
 
32
  _lock = threading.Lock()
33
  _load_attempted = False
34
+ _detectors: list[Any] = []
35
+ _detector_weights: list[float] = []
36
  _clip_zeroshot = None
 
37
 
38
 
39
  def _skip_model_loads() -> bool:
 
61
  return pipeline(
62
  "image-classification",
63
  model=model_id,
64
+ trust_remote_code=True,
65
+ model_kwargs={"cache_dir": CACHE, "trust_remote_code": True},
66
  )
67
  except Exception:
68
+ try:
69
+ return pipeline(
70
+ "image-classification",
71
+ model=model_id,
72
+ model_kwargs={"cache_dir": CACHE},
73
+ )
74
+ except Exception:
75
+ return pipeline("image-classification", model=model_id)
76
 
77
 
78
  def _build_zero_shot_image_classifier(model_id: str) -> Any:
 
82
  return pipeline(
83
  "zero-shot-image-classification",
84
  model=model_id,
85
+ trust_remote_code=True,
86
+ model_kwargs={"cache_dir": CACHE, "trust_remote_code": True},
87
  )
88
  except Exception:
89
+ try:
90
+ return pipeline(
91
+ "zero-shot-image-classification",
92
+ model=model_id,
93
+ model_kwargs={"cache_dir": CACHE},
94
+ )
95
+ except Exception:
96
+ return pipeline("zero-shot-image-classification", model=model_id)
97
 
98
 
99
  def _load() -> None:
100
+ global _detectors, _detector_weights, _clip_zeroshot, _load_attempted
101
  if _load_attempted:
102
  return
103
 
 
109
  logger.info("Loading fingerprint models...")
110
 
111
  try:
112
+ configured_models = [
113
+ model_id.strip()
114
+ for model_id in os.environ.get(
115
+ "FINGERPRINT_MODEL_IDS",
116
+ (
117
+ "yermandy/deepfake-detection,"
118
+ "yermandy/GenD_CLIP_L_14,"
119
+ "yermandy/GenD_DINOv3_L,"
120
+ "Wvolf/ViT_Deepfake_Detection,"
121
+ "prithivMLmods/Deep-Fake-Detector-v2-Model,"
122
+ "Smogy/SMOGY-Ai-images-detector"
123
+ ),
124
+ ).split(",")
125
+ if model_id.strip()
126
+ ]
127
+ configured_weights = [
128
+ value.strip()
129
+ for value in os.environ.get(
130
+ "FINGERPRINT_MODEL_WEIGHTS",
131
+ "1.4,1.4,1.1,1.0,1.0,0.9",
132
+ ).split(",")
133
+ if value.strip()
134
+ ]
135
 
136
+ for index, model_id in enumerate(configured_models):
137
+ try:
138
+ _detectors.append(_build_image_classifier(model_id))
139
+ try:
140
+ _detector_weights.append(float(configured_weights[index]))
141
+ except Exception:
142
+ _detector_weights.append(1.0)
143
+ logger.info("Loaded fingerprint detector: %s", model_id)
144
+ except Exception as exc:
145
+ logger.warning("Fingerprint detector unavailable (%s): %s", model_id, exc)
146
+
147
+ _clip_zeroshot = _build_zero_shot_image_classifier("openai/clip-vit-large-patch14")
148
 
149
  except Exception as exc:
150
  logger.warning("Fingerprint models unavailable: %s", exc)
 
181
  image = image.convert("RGB")
182
 
183
  fake_score = 0.5
184
+ weighted_scores: list[float] = []
185
+ weight_total = 0.0
186
+ for idx, detector in enumerate(_detectors):
 
 
 
 
187
  try:
188
+ score = _fake_score(detector(image))
189
+ weight = _detector_weights[idx] if idx < len(_detector_weights) else 1.0
190
+ weighted_scores.append(score * max(weight, 0.0))
191
+ weight_total += max(weight, 0.0)
192
+ except Exception as exc:
193
+ logger.warning("Fingerprint detector inference error: %s", exc)
194
+ if weighted_scores and weight_total > 0.0:
195
+ fake_score = float(np.clip(sum(weighted_scores) / weight_total, 0.0, 1.0))
196
 
197
  generator = "real"
198
  try:
src/engines/sstgnn/engine.py CHANGED
@@ -6,7 +6,6 @@ import threading
6
  import time
7
  import urllib.request
8
  from pathlib import Path
9
- from typing import Any
10
 
11
  import numpy as np
12
  from PIL import Image
@@ -14,12 +13,9 @@ from PIL import Image
14
  from src.types import EngineResult
15
 
16
  logger = logging.getLogger(__name__)
17
- CACHE = os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
18
 
19
  _lock = threading.Lock()
20
  _load_attempted = False
21
- _det1 = None
22
- _det2 = None
23
  _mesh = None
24
  _delaunay = None
25
 
@@ -33,14 +29,6 @@ def _skip_model_loads() -> bool:
33
  }
34
 
35
 
36
- def _get_pipeline():
37
- try:
38
- from transformers import pipeline as hf_pipeline # type: ignore
39
- except Exception:
40
- from transformers.pipelines import pipeline as hf_pipeline # type: ignore
41
- return hf_pipeline
42
-
43
-
44
  KEYPOINT_STEP = 7
45
  KEYPOINT_COUNT = 68
46
 
@@ -84,20 +72,7 @@ def _ensure_face_landmarker_asset() -> Path:
84
  return model_path
85
 
86
 
87
- def _build_image_classifier(model_id: str) -> Any:
88
- pipeline = _get_pipeline()
89
-
90
- try:
91
- return pipeline(
92
- "image-classification",
93
- model=model_id,
94
- model_kwargs={"cache_dir": CACHE},
95
- )
96
- except Exception:
97
- return pipeline("image-classification", model=model_id)
98
-
99
-
100
- def _build_face_mesh() -> Any:
101
  import mediapipe as mp # type: ignore
102
 
103
  if hasattr(mp, "solutions"):
@@ -121,7 +96,7 @@ def _build_face_mesh() -> Any:
121
 
122
 
123
  def _load() -> None:
124
- global _det1, _det2, _mesh, _delaunay, _load_attempted
125
  if _load_attempted:
126
  return
127
 
@@ -132,15 +107,6 @@ def _load() -> None:
132
 
133
  logger.info("Loading SSTGNN models...")
134
 
135
- try:
136
- _det1 = _build_image_classifier("dima806/deepfake_vs_real_image_detection")
137
- try:
138
- _det2 = _build_image_classifier("prithivMLmods/Deep-Fake-Detector-Model")
139
- except Exception:
140
- logger.warning("SSTGNN backup detector unavailable")
141
- except Exception as exc:
142
- logger.warning("SSTGNN HF detector load failed: %s", exc)
143
-
144
  try:
145
  _mesh = _build_face_mesh()
146
  except Exception as exc:
@@ -156,19 +122,6 @@ def _load() -> None:
156
  logger.info("SSTGNN model load attempt complete")
157
 
158
 
159
- def _fake_prob(preds: list[dict]) -> float:
160
- fake_keywords = ("fake", "deepfake", "artificial", "generated", "ai", "synthetic")
161
- best = 0.0
162
- for pred in preds:
163
- label = str(pred.get("label", "")).lower()
164
- score = float(pred.get("score", 0.0))
165
- if any(keyword in label for keyword in fake_keywords):
166
- best = max(best, score)
167
- if best == 0.0:
168
- return 0.5
169
- return float(np.clip(best, 0.0, 1.0))
170
-
171
-
172
  class SSTGNNEngine:
173
  def _ensure(self) -> None:
174
  with _lock:
@@ -181,39 +134,15 @@ class SSTGNNEngine:
181
  if image.mode != "RGB":
182
  image = image.convert("RGB")
183
 
184
- scores: list[float] = []
185
- try:
186
- if _det1 is not None:
187
- scores.append(_fake_prob(_det1(image)) * 0.6)
188
- except Exception as exc:
189
- logger.warning("SSTGNN det1 error: %s", exc)
190
-
191
- if _det2 is not None:
192
- try:
193
- scores.append(_fake_prob(_det2(image)) * 0.4)
194
- except Exception as exc:
195
- logger.warning("SSTGNN det2 error: %s", exc)
196
-
197
- if not scores:
198
- return EngineResult(
199
- engine="sstgnn",
200
- verdict="REAL",
201
- confidence=0.5,
202
- attributed_generator=None,
203
- explanation="All detectors failed; returning neutral score.",
204
- processing_time_ms=(time.perf_counter() - t0) * 1000,
205
- )
206
-
207
- cnn = sum(scores) / (0.6 if len(scores) == 1 else 1.0)
208
  graph = self._geometry_score(np.array(image))
209
- final = float(np.clip(cnn * 0.7 + graph * 0.3, 0.0, 1.0))
210
 
211
  return EngineResult(
212
  engine="sstgnn",
213
  verdict="FAKE" if final > 0.5 else "REAL",
214
  confidence=final,
215
  attributed_generator=None,
216
- explanation=f"CNN {cnn:.2f}, geometric graph anomaly {graph:.2f}.",
217
  processing_time_ms=(time.perf_counter() - t0) * 1000,
218
  )
219
 
 
6
  import time
7
  import urllib.request
8
  from pathlib import Path
 
9
 
10
  import numpy as np
11
  from PIL import Image
 
13
  from src.types import EngineResult
14
 
15
  logger = logging.getLogger(__name__)
 
16
 
17
  _lock = threading.Lock()
18
  _load_attempted = False
 
 
19
  _mesh = None
20
  _delaunay = None
21
 
 
29
  }
30
 
31
 
 
 
 
 
 
 
 
 
32
  KEYPOINT_STEP = 7
33
  KEYPOINT_COUNT = 68
34
 
 
72
  return model_path
73
 
74
 
75
+ def _build_face_mesh():
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  import mediapipe as mp # type: ignore
77
 
78
  if hasattr(mp, "solutions"):
 
96
 
97
 
98
  def _load() -> None:
99
+ global _mesh, _delaunay, _load_attempted
100
  if _load_attempted:
101
  return
102
 
 
107
 
108
  logger.info("Loading SSTGNN models...")
109
 
 
 
 
 
 
 
 
 
 
110
  try:
111
  _mesh = _build_face_mesh()
112
  except Exception as exc:
 
122
  logger.info("SSTGNN model load attempt complete")
123
 
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  class SSTGNNEngine:
126
  def _ensure(self) -> None:
127
  with _lock:
 
134
  if image.mode != "RGB":
135
  image = image.convert("RGB")
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  graph = self._geometry_score(np.array(image))
138
+ final = float(np.clip(graph, 0.0, 1.0))
139
 
140
  return EngineResult(
141
  engine="sstgnn",
142
  verdict="FAKE" if final > 0.5 else "REAL",
143
  confidence=final,
144
  attributed_generator=None,
145
+ explanation=f"Geometric graph anomaly {graph:.2f}.",
146
  processing_time_ms=(time.perf_counter() - t0) * 1000,
147
  )
148
 
src/explainability/explainer.py CHANGED
@@ -26,22 +26,24 @@ SYSTEM_INSTRUCTION = (
26
  "Output only the explanation text."
27
  )
28
 
29
- MODEL_CANDIDATES = (
30
- # Preferred order: Gemini 3.1 first, then 2.5 and legacy fallbacks.
31
  "gemini-3.1-pro-preview",
32
  "gemini-3.1-pro-preview-customtools",
 
 
33
  "gemini-2.5-pro",
34
  "gemini-2.5-flash",
35
  "gemini-2.5-flash-lite",
36
- # Legacy/compatibility fallbacks.
37
- "gemini-2.0-flash",
38
- "gemini-1.5-pro",
39
- "gemini-1.5-pro-latest",
40
- # legacy names kept as last-resort candidates
41
- "gemini-2.5-pro-preview-03-25",
42
- "gemini-1.5-pro-002",
43
  )
44
 
 
 
 
 
 
 
 
45
  REQUEST_TIMEOUT_S = float(os.environ.get("GEMINI_REQUEST_TIMEOUT_S", "10"))
46
  MAX_MODEL_ATTEMPTS = max(1, int(os.environ.get("GEMINI_MAX_MODEL_ATTEMPTS", "3")))
47
  ENABLE_LEGACY_MODEL_DISCOVERY = os.environ.get("GEMINI_DISCOVER_MODELS", "").strip().lower() in {
 
26
  "Output only the explanation text."
27
  )
28
 
29
+ DEFAULT_MODEL_CANDIDATES = (
30
+ # Source: https://ai.google.dev/gemini-api/docs/models (checked March 2026).
31
  "gemini-3.1-pro-preview",
32
  "gemini-3.1-pro-preview-customtools",
33
+ "gemini-3-flash-preview",
34
+ "gemini-3.1-flash-lite-preview",
35
  "gemini-2.5-pro",
36
  "gemini-2.5-flash",
37
  "gemini-2.5-flash-lite",
 
 
 
 
 
 
 
38
  )
39
 
40
+ _configured_candidates = [
41
+ value.strip()
42
+ for value in os.environ.get("GEMINI_MODEL_CANDIDATES", "").split(",")
43
+ if value.strip()
44
+ ]
45
+ MODEL_CANDIDATES = tuple(_configured_candidates) if _configured_candidates else DEFAULT_MODEL_CANDIDATES
46
+
47
  REQUEST_TIMEOUT_S = float(os.environ.get("GEMINI_REQUEST_TIMEOUT_S", "10"))
48
  MAX_MODEL_ATTEMPTS = max(1, int(os.environ.get("GEMINI_MAX_MODEL_ATTEMPTS", "3")))
49
  ENABLE_LEGACY_MODEL_DISCOVERY = os.environ.get("GEMINI_DISCOVER_MODELS", "").strip().lower() in {
src/services/media_utils.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import subprocess
5
+ import tempfile
6
+ import wave
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def extract_video_frames(video_path: str | Path, max_frames: int = 300) -> list[np.ndarray]:
    """Decode up to ``max_frames`` evenly spaced RGB frames from a video file.

    Args:
        video_path: Path to a video container readable by OpenCV.
        max_frames: Hard cap on the number of frames returned.

    Returns:
        A list of ``np.ndarray`` frames in RGB channel order (OpenCV decodes
        BGR, so each kept frame is converted). Empty when the file cannot be
        opened or contains no decodable frames.

    Raises:
        RuntimeError: If OpenCV cannot be imported in this environment.
    """
    try:
        import cv2  # type: ignore
    except Exception as exc:
        raise RuntimeError(f"OpenCV unavailable: {exc}") from exc

    path = str(Path(video_path))
    cap = cv2.VideoCapture(path)
    # Fail soft on missing/unreadable files: callers treat [] as "no frames".
    if not cap.isOpened():
        cap.release()
        return []

    # CAP_PROP_FRAME_COUNT may be 0 or negative for some streams; fall back
    # to sampling every frame (the max_frames cap below still bounds output).
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step = max(1, total // max_frames) if total > 0 else 1

    frames: list[np.ndarray] = []
    try:
        index = 0
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            if index % step == 0:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            index += 1
            if len(frames) >= max_frames:
                break
    finally:
        # Always release the capture handle, even if decoding/conversion raises.
        cap.release()
    return frames
39
+
40
+
41
def extract_audio_waveform(
    video_path: str | Path,
    sample_rate: int = 16000,
) -> tuple[np.ndarray, int] | None:
    """Extract a mono float32 waveform from a media file via ffmpeg.

    Args:
        video_path: Source media file (video or audio container).
        sample_rate: Target sample rate passed to ffmpeg's resampler.

    Returns:
        ``(samples, rate)`` where ``samples`` is a 1-D float32 array in
        ``[-1.0, 1.0]`` and ``rate`` is the rate reported by the decoded WAV,
        or ``None`` when the file is missing, ffmpeg is unavailable or fails,
        or the stream contains no usable audio.
    """
    path = Path(video_path)
    if not path.exists():
        return None

    # Reserve a temporary .wav path that ffmpeg will overwrite (-y below).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = Path(tmp.name)

    cmd = [
        "ffmpeg",
        "-nostdin",
        "-loglevel",
        "error",
        "-y",
        "-i",
        str(path),
        "-vn",  # drop the video stream
        "-ac",
        "1",  # downmix to mono
        "-ar",
        str(sample_rate),
        str(wav_path),
    ]

    try:
        # Broad except covers both a missing ffmpeg binary and conversion errors.
        subprocess.run(cmd, check=True, capture_output=True)
    except Exception as exc:
        logger.warning("Audio extraction failed via ffmpeg: %s", exc)
        wav_path.unlink(missing_ok=True)
        return None

    try:
        with wave.open(str(wav_path), "rb") as wav_file:
            channels = wav_file.getnchannels()
            sr = wav_file.getframerate()
            sampwidth = wav_file.getsampwidth()
            pcm = wav_file.readframes(wav_file.getnframes())
    except Exception as exc:
        logger.warning("Could not read extracted WAV file: %s", exc)
        return None
    finally:
        # The finally clause removes the temp file on success *and* failure,
        # so the except branch above must not unlink it a second time.
        wav_path.unlink(missing_ok=True)

    if not pcm:
        return None

    # Normalise PCM to float32 in [-1, 1]. ffmpeg defaults to pcm_s16le here,
    # but tolerate 8-bit unsigned and 32-bit signed widths as well.
    if sampwidth == 1:
        arr = np.frombuffer(pcm, dtype=np.uint8).astype(np.float32)
        arr = (arr - 128.0) / 128.0
    elif sampwidth == 2:
        arr = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
    elif sampwidth == 4:
        arr = np.frombuffer(pcm, dtype=np.int32).astype(np.float32) / 2147483648.0
    else:
        logger.warning("Unsupported audio sample width: %s", sampwidth)
        return None

    # Defensive: -ac 1 should already yield mono, but downmix if it did not.
    if channels > 1:
        arr = arr.reshape(-1, channels).mean(axis=1)

    arr = np.clip(arr, -1.0, 1.0).astype(np.float32)
    if arr.size == 0:
        return None
    return arr, sr