akagtag committed on
Commit
1e24aab
·
1 Parent(s): c909ee6

Update model stack, add audio stream extraction, and harden inference wiring

Browse files
requirements.txt CHANGED
@@ -18,6 +18,7 @@ torchvision>=0.16.0
18
  facenet-pytorch>=2.5.3; python_version < "3.13"
19
  mediapipe>=0.10.14
20
  opencv-python-headless>=4.9.0
 
21
 
22
  # ML - sstgnn
23
  torch-geometric>=2.5.0
 
18
  facenet-pytorch>=2.5.3; python_version < "3.13"
19
  mediapipe>=0.10.14
20
  opencv-python-headless>=4.9.0
21
+ librosa>=0.10.2
22
 
23
  # ML - sstgnn
24
  torch-geometric>=2.5.0
runpod_handler.py CHANGED
@@ -16,34 +16,13 @@ from src.engines.fingerprint.engine import FingerprintEngine
16
  from src.engines.sstgnn.engine import SSTGNNEngine
17
  from src.explainability.explainer import explain
18
  from src.fusion.fuser import fuse
 
19
 
20
  _fp = FingerprintEngine()
21
  _co = CoherenceEngine()
22
  _st = SSTGNNEngine()
23
 
24
 
25
- def _extract_frames(video_path: str) -> list:
26
- try:
27
- import cv2
28
- except Exception:
29
- return []
30
-
31
- cap = cv2.VideoCapture(video_path)
32
- frames = []
33
- index = 0
34
- while True:
35
- ret, frame = cap.read()
36
- if not ret:
37
- break
38
- if index % 4 == 0:
39
- frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
40
- index += 1
41
- if len(frames) >= 300:
42
- break
43
- cap.release()
44
- return frames
45
-
46
-
47
  def handler(job: dict) -> dict:
48
  inp = job.get("input", {})
49
  encoded = inp.get("data") or inp.get("image_b64")
@@ -67,12 +46,18 @@ def handler(job: dict) -> dict:
67
  tmp_path = temp.name
68
 
69
  try:
70
- frames = _extract_frames(tmp_path)
 
71
  finally:
72
  os.unlink(tmp_path)
73
 
 
 
 
 
 
74
  fp = _fp.run_video(frames)
75
- co = _co.run_video(frames)
76
  st = _st.run_video(frames)
77
  verdict, conf, generator = fuse([fp, co, st], is_video=True)
78
 
 
16
  from src.engines.sstgnn.engine import SSTGNNEngine
17
  from src.explainability.explainer import explain
18
  from src.fusion.fuser import fuse
19
+ from src.services.media_utils import extract_audio_waveform, extract_video_frames
20
 
21
  _fp = FingerprintEngine()
22
  _co = CoherenceEngine()
23
  _st = SSTGNNEngine()
24
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def handler(job: dict) -> dict:
27
  inp = job.get("input", {})
28
  encoded = inp.get("data") or inp.get("image_b64")
 
46
  tmp_path = temp.name
47
 
48
  try:
49
+ frames = extract_video_frames(tmp_path, max_frames=300)
50
+ audio = extract_audio_waveform(tmp_path, sample_rate=16000)
51
  finally:
52
  os.unlink(tmp_path)
53
 
54
+ audio_waveform = None
55
+ audio_sample_rate = 16000
56
+ if audio is not None:
57
+ audio_waveform, audio_sample_rate = audio
58
+
59
  fp = _fp.run_video(frames)
60
+ co = _co.run_video(frames, audio_waveform, audio_sample_rate)
61
  st = _st.run_video(frames)
62
  verdict, conf, generator = fuse([fp, co, st], is_video=True)
63
 
src/api/main.py CHANGED
@@ -19,7 +19,7 @@ from PIL import Image
19
  from src.engines.coherence.engine import CoherenceEngine
20
  from src.engines.fingerprint.engine import FingerprintEngine
21
  from src.engines.sstgnn.engine import SSTGNNEngine
22
- from src.explainability.explainer import explain
23
  from src.fusion.fuser import fuse
24
  from src.services.hf_inference_client import HFInferenceClient, HFInferenceUnavailable
25
  from src.services.inference_router import (
@@ -27,6 +27,7 @@ from src.services.inference_router import (
27
  is_runpod_configured,
28
  route_inference,
29
  )
 
30
  from src.types import DetectionResponse, EngineResult
31
 
32
  logger = logging.getLogger(__name__)
@@ -83,20 +84,32 @@ SUPPORTED_GENERATORS = [
83
  def _model_inventory() -> dict[str, object]:
84
  return {
85
  "fingerprint": {
86
- "primary_detector": "Organika/sdxl-detector",
87
- "backup_detector": "haywoodsloan/ai-image-detector-deploy",
 
 
 
 
 
 
 
88
  "attribution_model": "openai/clip-vit-large-patch14",
89
  },
90
  "coherence": {
91
- "hf_fallback_model": os.environ.get("COHERENCE_HF_MODEL_ID", "Wvolf/ViT_Deepfake_Detection"),
 
 
 
92
  "facial_landmarks": "mediapipe FaceMesh/FaceLandmarker",
93
  "temporal_embedding": "facenet-pytorch InceptionResnetV1(vggface2) when available",
94
  },
95
  "sstgnn": {
96
- "primary_detector": "dima806/deepfake_vs_real_image_detection",
97
- "backup_detector": "prithivMLmods/Deep-Fake-Detector-Model",
98
  "graph_component": "scipy.spatial.Delaunay + MediaPipe landmarks",
99
  },
 
 
 
100
  "generator_labels": SUPPORTED_GENERATORS,
101
  }
102
 
@@ -137,32 +150,6 @@ async def health_models() -> dict[str, object]:
137
  return _model_inventory()
138
 
139
 
140
- def _extract_frames(path: str) -> list[np.ndarray]:
141
- try:
142
- import cv2
143
- except Exception as exc:
144
- raise RuntimeError(f"OpenCV unavailable: {exc}") from exc
145
-
146
- cap = cv2.VideoCapture(path)
147
- total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
148
- step = max(1, total // MAX_FRAMES) if total > 0 else 1
149
-
150
- frames: list[np.ndarray] = []
151
- index = 0
152
- while True:
153
- ret, frame = cap.read()
154
- if not ret:
155
- break
156
- if index % step == 0:
157
- frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
158
- index += 1
159
- if len(frames) >= MAX_FRAMES:
160
- break
161
-
162
- cap.release()
163
- return frames
164
-
165
-
166
  def _assign_processing_time(results: list[EngineResult], ms: float) -> list[EngineResult]:
167
  for result in results:
168
  result.processing_time_ms = round(ms, 2)
@@ -256,7 +243,7 @@ async def _hf_detect_video(data: bytes) -> DetectionResponse:
256
  tmp_path = tmp.name
257
 
258
  try:
259
- frames = await asyncio.to_thread(_extract_frames, tmp_path)
260
  finally:
261
  Path(tmp_path).unlink(missing_ok=True)
262
 
@@ -404,7 +391,9 @@ async def detect_video(file: UploadFile = File(...)) -> DetectionResponse:
404
  tmp_path = tmp.name
405
 
406
  try:
407
- frames = await asyncio.to_thread(_extract_frames, tmp_path)
 
 
408
  finally:
409
  Path(tmp_path).unlink(missing_ok=True)
410
 
@@ -412,10 +401,14 @@ async def detect_video(file: UploadFile = File(...)) -> DetectionResponse:
412
  raise HTTPException(status_code=422, detail="Could not extract frames")
413
 
414
  await _ensure_models_loaded()
 
 
 
 
415
 
416
  fp, co, st = await asyncio.gather(
417
  asyncio.to_thread(_fp.run_video, frames),
418
- asyncio.to_thread(_co.run_video, frames),
419
  asyncio.to_thread(_st.run_video, frames),
420
  )
421
 
 
19
  from src.engines.coherence.engine import CoherenceEngine
20
  from src.engines.fingerprint.engine import FingerprintEngine
21
  from src.engines.sstgnn.engine import SSTGNNEngine
22
+ from src.explainability.explainer import MODEL_CANDIDATES, explain
23
  from src.fusion.fuser import fuse
24
  from src.services.hf_inference_client import HFInferenceClient, HFInferenceUnavailable
25
  from src.services.inference_router import (
 
27
  is_runpod_configured,
28
  route_inference,
29
  )
30
+ from src.services.media_utils import extract_audio_waveform, extract_video_frames
31
  from src.types import DetectionResponse, EngineResult
32
 
33
  logger = logging.getLogger(__name__)
 
84
  def _model_inventory() -> dict[str, object]:
85
  return {
86
  "fingerprint": {
87
+ "ensemble_detectors": [
88
+ "yermandy/deepfake-detection",
89
+ "yermandy/GenD_CLIP_L_14",
90
+ "yermandy/GenD_DINOv3_L",
91
+ "Wvolf/ViT_Deepfake_Detection",
92
+ "prithivMLmods/Deep-Fake-Detector-v2-Model",
93
+ "Smogy/SMOGY-Ai-images-detector",
94
+ ],
95
+ "ensemble_weights": [1.4, 1.4, 1.1, 1.0, 1.0, 0.9],
96
  "attribution_model": "openai/clip-vit-large-patch14",
97
  },
98
  "coherence": {
99
+ "audio_deepfake_model": os.environ.get(
100
+ "COHERENCE_AUDIO_MODEL_ID",
101
+ "nii-yamagishilab/wav2vec-large-anti-deepfake-nda",
102
+ ),
103
  "facial_landmarks": "mediapipe FaceMesh/FaceLandmarker",
104
  "temporal_embedding": "facenet-pytorch InceptionResnetV1(vggface2) when available",
105
  },
106
  "sstgnn": {
107
+ "pretrained_hf_models": [],
 
108
  "graph_component": "scipy.spatial.Delaunay + MediaPipe landmarks",
109
  },
110
+ "explainability": {
111
+ "gemini_model_candidates": list(MODEL_CANDIDATES),
112
+ },
113
  "generator_labels": SUPPORTED_GENERATORS,
114
  }
115
 
 
150
  return _model_inventory()
151
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def _assign_processing_time(results: list[EngineResult], ms: float) -> list[EngineResult]:
154
  for result in results:
155
  result.processing_time_ms = round(ms, 2)
 
243
  tmp_path = tmp.name
244
 
245
  try:
246
+ frames = await asyncio.to_thread(extract_video_frames, tmp_path, MAX_FRAMES)
247
  finally:
248
  Path(tmp_path).unlink(missing_ok=True)
249
 
 
391
  tmp_path = tmp.name
392
 
393
  try:
394
+ frames_task = asyncio.to_thread(extract_video_frames, tmp_path, MAX_FRAMES)
395
+ audio_task = asyncio.to_thread(extract_audio_waveform, tmp_path, 16000)
396
+ frames, audio = await asyncio.gather(frames_task, audio_task)
397
  finally:
398
  Path(tmp_path).unlink(missing_ok=True)
399
 
 
401
  raise HTTPException(status_code=422, detail="Could not extract frames")
402
 
403
  await _ensure_models_loaded()
404
+ audio_waveform = None
405
+ audio_sample_rate = 16000
406
+ if audio is not None:
407
+ audio_waveform, audio_sample_rate = audio
408
 
409
  fp, co, st = await asyncio.gather(
410
  asyncio.to_thread(_fp.run_video, frames),
411
+ asyncio.to_thread(_co.run_video, frames, audio_waveform, audio_sample_rate),
412
  asyncio.to_thread(_st.run_video, frames),
413
  )
414
 
src/engines/coherence/detector.py CHANGED
@@ -6,8 +6,10 @@ from __future__ import annotations
6
 
7
  import os
8
  import tempfile
 
9
 
10
  from src.types import EngineResult
 
11
 
12
  from .engine import CoherenceEngine
13
 
@@ -16,39 +18,26 @@ class CoherenceDetector(CoherenceEngine):
16
  threshold = 0.5
17
 
18
  def detect_bytes(self, video_bytes: bytes) -> EngineResult:
19
- frames = self._extract_video_frames(video_bytes)
20
  if not frames:
21
  return self._error_result(0.0)
22
  try:
23
- return self.run_video(frames)
24
  except Exception:
25
  return self._error_result(0.0)
26
 
27
- def _extract_video_frames(self, video_bytes: bytes) -> list:
28
- try:
29
- import cv2
30
- except Exception:
31
- return []
32
-
33
  with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
34
  tmp.write(video_bytes)
35
  tmp_path = tmp.name
36
 
37
- frames = []
38
  try:
39
- cap = cv2.VideoCapture(tmp_path)
40
- index = 0
41
- while True:
42
- ok, frame = cap.read()
43
- if not ok:
44
- break
45
- if index % 2 == 0:
46
- frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
47
- index += 1
48
- if len(frames) >= 64:
49
- break
50
- cap.release()
51
- return frames
52
  finally:
53
  os.unlink(tmp_path)
54
 
 
6
 
7
  import os
8
  import tempfile
9
+ import numpy as np
10
 
11
  from src.types import EngineResult
12
+ from src.services.media_utils import extract_audio_waveform, extract_video_frames
13
 
14
  from .engine import CoherenceEngine
15
 
 
18
  threshold = 0.5
19
 
20
  def detect_bytes(self, video_bytes: bytes) -> EngineResult:
21
+ frames, audio_waveform, audio_sample_rate = self._extract_video_media(video_bytes)
22
  if not frames:
23
  return self._error_result(0.0)
24
  try:
25
+ return self.run_video(frames, audio_waveform, audio_sample_rate)
26
  except Exception:
27
  return self._error_result(0.0)
28
 
29
+ def _extract_video_media(self, video_bytes: bytes) -> tuple[list[np.ndarray], np.ndarray | None, int]:
 
 
 
 
 
30
  with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
31
  tmp.write(video_bytes)
32
  tmp_path = tmp.name
33
 
 
34
  try:
35
+ frames = extract_video_frames(tmp_path, max_frames=64)
36
+ audio = extract_audio_waveform(tmp_path, sample_rate=16000)
37
+ if audio is None:
38
+ return frames, None, 16000
39
+ waveform, sample_rate = audio
40
+ return frames, waveform, sample_rate
 
 
 
 
 
 
 
41
  finally:
42
  os.unlink(tmp_path)
43
 
src/engines/coherence/engine.py CHANGED
@@ -21,7 +21,7 @@ _mtcnn = None
21
  _resnet = None
22
  _face_mesh = None
23
  _torch = None
24
- _hf_detector = None
25
 
26
 
27
  def _skip_model_loads() -> bool:
@@ -88,8 +88,8 @@ def _build_face_mesh():
88
  static_image_mode=False,
89
  max_num_faces=1,
90
  refine_landmarks=True,
91
- min_detection_confidence=0.5,
92
- )
93
 
94
  from mediapipe.tasks import python as mp_tasks_python # type: ignore
95
  from mediapipe.tasks.python import vision # type: ignore
@@ -104,22 +104,30 @@ def _build_face_mesh():
104
  return _TasksFaceMeshAdapter(mp, landmarker)
105
 
106
 
107
- def _build_image_classifier(model_id: str) -> Any:
108
  pipeline = _get_pipeline()
109
 
110
  cache_dir = os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
111
  try:
112
  return pipeline(
113
- "image-classification",
114
  model=model_id,
115
- model_kwargs={"cache_dir": cache_dir},
 
116
  )
117
  except Exception:
118
- return pipeline("image-classification", model=model_id)
 
 
 
 
 
 
 
119
 
120
 
121
  def _load() -> None:
122
- global _mtcnn, _resnet, _face_mesh, _load_attempted, _torch, _hf_detector
123
  if _load_attempted:
124
  return
125
 
@@ -152,10 +160,13 @@ def _load() -> None:
152
  logger.warning("Coherence embedding model load failed, using heuristic-only mode: %s", exc)
153
 
154
  try:
155
- model_id = os.environ.get("COHERENCE_HF_MODEL_ID", "Wvolf/ViT_Deepfake_Detection")
156
- _hf_detector = _build_image_classifier(model_id)
 
 
 
157
  except Exception as exc:
158
- logger.warning("Coherence HF fallback model unavailable: %s", exc)
159
 
160
  logger.info("Coherence model load attempt complete")
161
 
@@ -171,7 +182,6 @@ class CoherenceEngine:
171
 
172
  frame = np.array(image.convert("RGB"))
173
  score = self._image_score(frame)
174
- score = float(np.clip(score * 0.6 + self._hf_image_score(image) * 0.4, 0.0, 1.0))
175
 
176
  return EngineResult(
177
  engine="coherence",
@@ -214,7 +224,12 @@ class CoherenceEngine:
214
  logger.warning("Coherence image scoring failed: %s", exc)
215
  return 0.35
216
 
217
- def run_video(self, frames: list[np.ndarray]) -> EngineResult:
 
 
 
 
 
218
  t0 = time.perf_counter()
219
  self._ensure()
220
 
@@ -236,9 +251,8 @@ class CoherenceEngine:
236
  delta = self._embedding_variance(frames)
237
  jerk = self._landmark_jerk(frames)
238
  blink = self._blink_anomaly(frames)
239
-
240
- hf_video = self._hf_video_score(frames)
241
- score = float(np.clip(delta * 0.35 + jerk * 0.30 + blink * 0.15 + hf_video * 0.20, 0.0, 1.0))
242
 
243
  return EngineResult(
244
  engine="coherence",
@@ -249,44 +263,45 @@ class CoherenceEngine:
249
  f"Embedding variance {delta:.2f}, "
250
  f"landmark jerk {jerk:.2f}, "
251
  f"blink anomaly {blink:.2f}, "
252
- f"hf score {hf_video:.2f}."
253
  ),
254
  processing_time_ms=(time.perf_counter() - t0) * 1000,
255
  )
256
 
257
- def _hf_image_score(self, image: Image.Image) -> float:
258
- if _hf_detector is None:
259
  return 0.5
260
- try:
261
- preds = _hf_detector(image)
262
- return self._fake_score_from_preds(preds)
263
- except Exception:
264
  return 0.5
265
 
266
- def _hf_video_score(self, frames: list[np.ndarray]) -> float:
267
- if _hf_detector is None or not frames:
268
- return 0.5
269
- values: list[float] = []
270
- for frame in frames[::8]:
271
- try:
272
- preds = _hf_detector(Image.fromarray(frame))
273
- values.append(self._fake_score_from_preds(preds))
274
- except Exception:
275
- continue
276
- if not values:
277
  return 0.5
278
- return float(np.clip(np.mean(values), 0.0, 1.0))
279
 
280
- def _fake_score_from_preds(self, preds: list[dict]) -> float:
 
 
 
281
  if not preds:
282
  return 0.5
283
- keywords = ("fake", "deepfake", "generated", "synthetic", "ai", "artificial")
 
284
  best = 0.0
285
  for pred in preds:
286
  label = str(pred.get("label", "")).lower()
287
  score = float(pred.get("score", 0.0))
288
- if any(keyword in label for keyword in keywords):
289
  best = max(best, score)
 
290
  if best == 0.0:
291
  return 0.5
292
  return float(np.clip(best, 0.0, 1.0))
 
21
  _resnet = None
22
  _face_mesh = None
23
  _torch = None
24
+ _audio_detector = None
25
 
26
 
27
  def _skip_model_loads() -> bool:
 
88
  static_image_mode=False,
89
  max_num_faces=1,
90
  refine_landmarks=True,
91
+ min_detection_confidence=0.5,
92
+ )
93
 
94
  from mediapipe.tasks import python as mp_tasks_python # type: ignore
95
  from mediapipe.tasks.python import vision # type: ignore
 
104
  return _TasksFaceMeshAdapter(mp, landmarker)
105
 
106
 
107
+ def _build_audio_classifier(model_id: str) -> Any:
108
  pipeline = _get_pipeline()
109
 
110
  cache_dir = os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
111
  try:
112
  return pipeline(
113
+ "audio-classification",
114
  model=model_id,
115
+ trust_remote_code=True,
116
+ model_kwargs={"cache_dir": cache_dir, "trust_remote_code": True},
117
  )
118
  except Exception:
119
+ try:
120
+ return pipeline(
121
+ "audio-classification",
122
+ model=model_id,
123
+ model_kwargs={"cache_dir": cache_dir},
124
+ )
125
+ except Exception:
126
+ return pipeline("audio-classification", model=model_id)
127
 
128
 
129
  def _load() -> None:
130
+ global _mtcnn, _resnet, _face_mesh, _load_attempted, _torch, _audio_detector
131
  if _load_attempted:
132
  return
133
 
 
160
  logger.warning("Coherence embedding model load failed, using heuristic-only mode: %s", exc)
161
 
162
  try:
163
+ model_id = os.environ.get(
164
+ "COHERENCE_AUDIO_MODEL_ID",
165
+ "nii-yamagishilab/wav2vec-large-anti-deepfake-nda",
166
+ )
167
+ _audio_detector = _build_audio_classifier(model_id)
168
  except Exception as exc:
169
+ logger.warning("Coherence audio model unavailable: %s", exc)
170
 
171
  logger.info("Coherence model load attempt complete")
172
 
 
182
 
183
  frame = np.array(image.convert("RGB"))
184
  score = self._image_score(frame)
 
185
 
186
  return EngineResult(
187
  engine="coherence",
 
224
  logger.warning("Coherence image scoring failed: %s", exc)
225
  return 0.35
226
 
227
+ def run_video(
228
+ self,
229
+ frames: list[np.ndarray],
230
+ audio_waveform: np.ndarray | None = None,
231
+ audio_sample_rate: int = 16000,
232
+ ) -> EngineResult:
233
  t0 = time.perf_counter()
234
  self._ensure()
235
 
 
251
  delta = self._embedding_variance(frames)
252
  jerk = self._landmark_jerk(frames)
253
  blink = self._blink_anomaly(frames)
254
+ audio = self._audio_deepfake_score(audio_waveform, audio_sample_rate)
255
+ score = float(np.clip(delta * 0.35 + jerk * 0.30 + blink * 0.15 + audio * 0.20, 0.0, 1.0))
 
256
 
257
  return EngineResult(
258
  engine="coherence",
 
263
  f"Embedding variance {delta:.2f}, "
264
  f"landmark jerk {jerk:.2f}, "
265
  f"blink anomaly {blink:.2f}, "
266
+ f"audio deepfake score {audio:.2f}."
267
  ),
268
  processing_time_ms=(time.perf_counter() - t0) * 1000,
269
  )
270
 
271
+ def _audio_deepfake_score(self, waveform: np.ndarray | None = None, sample_rate: int = 16000) -> float:
272
+ if _audio_detector is None:
273
  return 0.5
274
+ if waveform is None or waveform.size == 0:
 
 
 
275
  return 0.5
276
 
277
+ max_seconds = int(os.environ.get("COHERENCE_AUDIO_MAX_SECONDS", "30"))
278
+ max_samples = max(16000, sample_rate * max_seconds)
279
+ if waveform.size > max_samples:
280
+ waveform = waveform[:max_samples]
281
+
282
+ try:
283
+ preds = _audio_detector(
284
+ {"array": waveform.astype(np.float32), "sampling_rate": sample_rate},
285
+ top_k=5,
286
+ )
287
+ except Exception:
288
  return 0.5
 
289
 
290
+ if isinstance(preds, dict):
291
+ preds = [preds]
292
+ if preds and isinstance(preds[0], list):
293
+ preds = preds[0]
294
  if not preds:
295
  return 0.5
296
+
297
+ fake_keywords = ("spoof", "fake", "deepfake", "synthetic", "generated")
298
  best = 0.0
299
  for pred in preds:
300
  label = str(pred.get("label", "")).lower()
301
  score = float(pred.get("score", 0.0))
302
+ if any(keyword in label for keyword in fake_keywords):
303
  best = max(best, score)
304
+
305
  if best == 0.0:
306
  return 0.5
307
  return float(np.clip(best, 0.0, 1.0))
src/engines/fingerprint/engine.py CHANGED
@@ -31,9 +31,9 @@ _FAKE_KEYWORDS = ("artificial", "fake", "ai", "generated", "deepfake", "syntheti
31
 
32
  _lock = threading.Lock()
33
  _load_attempted = False
34
- _detector = None
 
35
  _clip_zeroshot = None
36
- _backup = None
37
 
38
 
39
  def _skip_model_loads() -> bool:
@@ -61,10 +61,18 @@ def _build_image_classifier(model_id: str) -> Any:
61
  return pipeline(
62
  "image-classification",
63
  model=model_id,
64
- model_kwargs={"cache_dir": CACHE},
 
65
  )
66
  except Exception:
67
- return pipeline("image-classification", model=model_id)
 
 
 
 
 
 
 
68
 
69
 
70
  def _build_zero_shot_image_classifier(model_id: str) -> Any:
@@ -74,14 +82,22 @@ def _build_zero_shot_image_classifier(model_id: str) -> Any:
74
  return pipeline(
75
  "zero-shot-image-classification",
76
  model=model_id,
77
- model_kwargs={"cache_dir": CACHE},
 
78
  )
79
  except Exception:
80
- return pipeline("zero-shot-image-classification", model=model_id)
 
 
 
 
 
 
 
81
 
82
 
83
  def _load() -> None:
84
- global _detector, _clip_zeroshot, _backup, _load_attempted
85
  if _load_attempted:
86
  return
87
 
@@ -93,13 +109,42 @@ def _load() -> None:
93
  logger.info("Loading fingerprint models...")
94
 
95
  try:
96
- _detector = _build_image_classifier("Organika/sdxl-detector")
97
- _clip_zeroshot = _build_zero_shot_image_classifier("openai/clip-vit-large-patch14")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
- try:
100
- _backup = _build_image_classifier("haywoodsloan/ai-image-detector-deploy")
101
- except Exception:
102
- logger.warning("Backup fingerprint detector unavailable")
 
 
 
 
 
 
 
 
103
 
104
  except Exception as exc:
105
  logger.warning("Fingerprint models unavailable: %s", exc)
@@ -136,18 +181,18 @@ class FingerprintEngine:
136
  image = image.convert("RGB")
137
 
138
  fake_score = 0.5
139
- try:
140
- if _detector is not None:
141
- fake_score = _fake_score(_detector(image))
142
- except Exception as exc:
143
- logger.warning("Primary detector error: %s", exc)
144
-
145
- if _backup is not None:
146
  try:
147
- backup_score = _fake_score(_backup(image))
148
- fake_score = float(np.clip(fake_score * 0.6 + backup_score * 0.4, 0.0, 1.0))
149
- except Exception:
150
- pass
 
 
 
 
151
 
152
  generator = "real"
153
  try:
 
31
 
32
  _lock = threading.Lock()
33
  _load_attempted = False
34
+ _detectors: list[Any] = []
35
+ _detector_weights: list[float] = []
36
  _clip_zeroshot = None
 
37
 
38
 
39
  def _skip_model_loads() -> bool:
 
61
  return pipeline(
62
  "image-classification",
63
  model=model_id,
64
+ trust_remote_code=True,
65
+ model_kwargs={"cache_dir": CACHE, "trust_remote_code": True},
66
  )
67
  except Exception:
68
+ try:
69
+ return pipeline(
70
+ "image-classification",
71
+ model=model_id,
72
+ model_kwargs={"cache_dir": CACHE},
73
+ )
74
+ except Exception:
75
+ return pipeline("image-classification", model=model_id)
76
 
77
 
78
  def _build_zero_shot_image_classifier(model_id: str) -> Any:
 
82
  return pipeline(
83
  "zero-shot-image-classification",
84
  model=model_id,
85
+ trust_remote_code=True,
86
+ model_kwargs={"cache_dir": CACHE, "trust_remote_code": True},
87
  )
88
  except Exception:
89
+ try:
90
+ return pipeline(
91
+ "zero-shot-image-classification",
92
+ model=model_id,
93
+ model_kwargs={"cache_dir": CACHE},
94
+ )
95
+ except Exception:
96
+ return pipeline("zero-shot-image-classification", model=model_id)
97
 
98
 
99
  def _load() -> None:
100
+ global _detectors, _detector_weights, _clip_zeroshot, _load_attempted
101
  if _load_attempted:
102
  return
103
 
 
109
  logger.info("Loading fingerprint models...")
110
 
111
  try:
112
+ configured_models = [
113
+ model_id.strip()
114
+ for model_id in os.environ.get(
115
+ "FINGERPRINT_MODEL_IDS",
116
+ (
117
+ "yermandy/deepfake-detection,"
118
+ "yermandy/GenD_CLIP_L_14,"
119
+ "yermandy/GenD_DINOv3_L,"
120
+ "Wvolf/ViT_Deepfake_Detection,"
121
+ "prithivMLmods/Deep-Fake-Detector-v2-Model,"
122
+ "Smogy/SMOGY-Ai-images-detector"
123
+ ),
124
+ ).split(",")
125
+ if model_id.strip()
126
+ ]
127
+ configured_weights = [
128
+ value.strip()
129
+ for value in os.environ.get(
130
+ "FINGERPRINT_MODEL_WEIGHTS",
131
+ "1.4,1.4,1.1,1.0,1.0,0.9",
132
+ ).split(",")
133
+ if value.strip()
134
+ ]
135
 
136
+ for index, model_id in enumerate(configured_models):
137
+ try:
138
+ _detectors.append(_build_image_classifier(model_id))
139
+ try:
140
+ _detector_weights.append(float(configured_weights[index]))
141
+ except Exception:
142
+ _detector_weights.append(1.0)
143
+ logger.info("Loaded fingerprint detector: %s", model_id)
144
+ except Exception as exc:
145
+ logger.warning("Fingerprint detector unavailable (%s): %s", model_id, exc)
146
+
147
+ _clip_zeroshot = _build_zero_shot_image_classifier("openai/clip-vit-large-patch14")
148
 
149
  except Exception as exc:
150
  logger.warning("Fingerprint models unavailable: %s", exc)
 
181
  image = image.convert("RGB")
182
 
183
  fake_score = 0.5
184
+ weighted_scores: list[float] = []
185
+ weight_total = 0.0
186
+ for idx, detector in enumerate(_detectors):
 
 
 
 
187
  try:
188
+ score = _fake_score(detector(image))
189
+ weight = _detector_weights[idx] if idx < len(_detector_weights) else 1.0
190
+ weighted_scores.append(score * max(weight, 0.0))
191
+ weight_total += max(weight, 0.0)
192
+ except Exception as exc:
193
+ logger.warning("Fingerprint detector inference error: %s", exc)
194
+ if weighted_scores and weight_total > 0.0:
195
+ fake_score = float(np.clip(sum(weighted_scores) / weight_total, 0.0, 1.0))
196
 
197
  generator = "real"
198
  try:
src/engines/sstgnn/engine.py CHANGED
@@ -6,7 +6,6 @@ import threading
6
  import time
7
  import urllib.request
8
  from pathlib import Path
9
- from typing import Any
10
 
11
  import numpy as np
12
  from PIL import Image
@@ -14,12 +13,9 @@ from PIL import Image
14
  from src.types import EngineResult
15
 
16
  logger = logging.getLogger(__name__)
17
- CACHE = os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
18
 
19
  _lock = threading.Lock()
20
  _load_attempted = False
21
- _det1 = None
22
- _det2 = None
23
  _mesh = None
24
  _delaunay = None
25
 
@@ -33,14 +29,6 @@ def _skip_model_loads() -> bool:
33
  }
34
 
35
 
36
- def _get_pipeline():
37
- try:
38
- from transformers import pipeline as hf_pipeline # type: ignore
39
- except Exception:
40
- from transformers.pipelines import pipeline as hf_pipeline # type: ignore
41
- return hf_pipeline
42
-
43
-
44
  KEYPOINT_STEP = 7
45
  KEYPOINT_COUNT = 68
46
 
@@ -84,20 +72,7 @@ def _ensure_face_landmarker_asset() -> Path:
84
  return model_path
85
 
86
 
87
- def _build_image_classifier(model_id: str) -> Any:
88
- pipeline = _get_pipeline()
89
-
90
- try:
91
- return pipeline(
92
- "image-classification",
93
- model=model_id,
94
- model_kwargs={"cache_dir": CACHE},
95
- )
96
- except Exception:
97
- return pipeline("image-classification", model=model_id)
98
-
99
-
100
- def _build_face_mesh() -> Any:
101
  import mediapipe as mp # type: ignore
102
 
103
  if hasattr(mp, "solutions"):
@@ -121,7 +96,7 @@ def _build_face_mesh() -> Any:
121
 
122
 
123
  def _load() -> None:
124
- global _det1, _det2, _mesh, _delaunay, _load_attempted
125
  if _load_attempted:
126
  return
127
 
@@ -132,15 +107,6 @@ def _load() -> None:
132
 
133
  logger.info("Loading SSTGNN models...")
134
 
135
- try:
136
- _det1 = _build_image_classifier("dima806/deepfake_vs_real_image_detection")
137
- try:
138
- _det2 = _build_image_classifier("prithivMLmods/Deep-Fake-Detector-Model")
139
- except Exception:
140
- logger.warning("SSTGNN backup detector unavailable")
141
- except Exception as exc:
142
- logger.warning("SSTGNN HF detector load failed: %s", exc)
143
-
144
  try:
145
  _mesh = _build_face_mesh()
146
  except Exception as exc:
@@ -156,19 +122,6 @@ def _load() -> None:
156
  logger.info("SSTGNN model load attempt complete")
157
 
158
 
159
- def _fake_prob(preds: list[dict]) -> float:
160
- fake_keywords = ("fake", "deepfake", "artificial", "generated", "ai", "synthetic")
161
- best = 0.0
162
- for pred in preds:
163
- label = str(pred.get("label", "")).lower()
164
- score = float(pred.get("score", 0.0))
165
- if any(keyword in label for keyword in fake_keywords):
166
- best = max(best, score)
167
- if best == 0.0:
168
- return 0.5
169
- return float(np.clip(best, 0.0, 1.0))
170
-
171
-
172
  class SSTGNNEngine:
173
  def _ensure(self) -> None:
174
  with _lock:
@@ -181,39 +134,15 @@ class SSTGNNEngine:
181
  if image.mode != "RGB":
182
  image = image.convert("RGB")
183
 
184
- scores: list[float] = []
185
- try:
186
- if _det1 is not None:
187
- scores.append(_fake_prob(_det1(image)) * 0.6)
188
- except Exception as exc:
189
- logger.warning("SSTGNN det1 error: %s", exc)
190
-
191
- if _det2 is not None:
192
- try:
193
- scores.append(_fake_prob(_det2(image)) * 0.4)
194
- except Exception as exc:
195
- logger.warning("SSTGNN det2 error: %s", exc)
196
-
197
- if not scores:
198
- return EngineResult(
199
- engine="sstgnn",
200
- verdict="REAL",
201
- confidence=0.5,
202
- attributed_generator=None,
203
- explanation="All detectors failed; returning neutral score.",
204
- processing_time_ms=(time.perf_counter() - t0) * 1000,
205
- )
206
-
207
- cnn = sum(scores) / (0.6 if len(scores) == 1 else 1.0)
208
  graph = self._geometry_score(np.array(image))
209
- final = float(np.clip(cnn * 0.7 + graph * 0.3, 0.0, 1.0))
210
 
211
  return EngineResult(
212
  engine="sstgnn",
213
  verdict="FAKE" if final > 0.5 else "REAL",
214
  confidence=final,
215
  attributed_generator=None,
216
- explanation=f"CNN {cnn:.2f}, geometric graph anomaly {graph:.2f}.",
217
  processing_time_ms=(time.perf_counter() - t0) * 1000,
218
  )
219
 
 
6
  import time
7
  import urllib.request
8
  from pathlib import Path
 
9
 
10
  import numpy as np
11
  from PIL import Image
 
13
  from src.types import EngineResult
14
 
15
  logger = logging.getLogger(__name__)
 
16
 
17
  _lock = threading.Lock()
18
  _load_attempted = False
 
 
19
  _mesh = None
20
  _delaunay = None
21
 
 
29
  }
30
 
31
 
 
 
 
 
 
 
 
 
32
  KEYPOINT_STEP = 7
33
  KEYPOINT_COUNT = 68
34
 
 
72
  return model_path
73
 
74
 
75
+ def _build_face_mesh():
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  import mediapipe as mp # type: ignore
77
 
78
  if hasattr(mp, "solutions"):
 
96
 
97
 
98
  def _load() -> None:
99
+ global _mesh, _delaunay, _load_attempted
100
  if _load_attempted:
101
  return
102
 
 
107
 
108
  logger.info("Loading SSTGNN models...")
109
 
 
 
 
 
 
 
 
 
 
110
  try:
111
  _mesh = _build_face_mesh()
112
  except Exception as exc:
 
122
  logger.info("SSTGNN model load attempt complete")
123
 
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  class SSTGNNEngine:
126
  def _ensure(self) -> None:
127
  with _lock:
 
134
  if image.mode != "RGB":
135
  image = image.convert("RGB")
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  graph = self._geometry_score(np.array(image))
138
+ final = float(np.clip(graph, 0.0, 1.0))
139
 
140
  return EngineResult(
141
  engine="sstgnn",
142
  verdict="FAKE" if final > 0.5 else "REAL",
143
  confidence=final,
144
  attributed_generator=None,
145
+ explanation=f"Geometric graph anomaly {graph:.2f}.",
146
  processing_time_ms=(time.perf_counter() - t0) * 1000,
147
  )
148
 
src/explainability/explainer.py CHANGED
@@ -26,22 +26,24 @@ SYSTEM_INSTRUCTION = (
26
  "Output only the explanation text."
27
  )
28
 
29
- MODEL_CANDIDATES = (
30
- # Preferred order: Gemini 3.1 first, then 2.5 and legacy fallbacks.
31
  "gemini-3.1-pro-preview",
32
  "gemini-3.1-pro-preview-customtools",
 
 
33
  "gemini-2.5-pro",
34
  "gemini-2.5-flash",
35
  "gemini-2.5-flash-lite",
36
- # Legacy/compatibility fallbacks.
37
- "gemini-2.0-flash",
38
- "gemini-1.5-pro",
39
- "gemini-1.5-pro-latest",
40
- # legacy names kept as last-resort candidates
41
- "gemini-2.5-pro-preview-03-25",
42
- "gemini-1.5-pro-002",
43
  )
44
 
 
 
 
 
 
 
 
45
  REQUEST_TIMEOUT_S = float(os.environ.get("GEMINI_REQUEST_TIMEOUT_S", "10"))
46
  MAX_MODEL_ATTEMPTS = max(1, int(os.environ.get("GEMINI_MAX_MODEL_ATTEMPTS", "3")))
47
  ENABLE_LEGACY_MODEL_DISCOVERY = os.environ.get("GEMINI_DISCOVER_MODELS", "").strip().lower() in {
 
26
  "Output only the explanation text."
27
  )
28
 
29
+ DEFAULT_MODEL_CANDIDATES = (
30
+ # Source: https://ai.google.dev/gemini-api/docs/models (checked March 2026).
31
  "gemini-3.1-pro-preview",
32
  "gemini-3.1-pro-preview-customtools",
33
+ "gemini-3-flash-preview",
34
+ "gemini-3.1-flash-lite-preview",
35
  "gemini-2.5-pro",
36
  "gemini-2.5-flash",
37
  "gemini-2.5-flash-lite",
 
 
 
 
 
 
 
38
  )
39
 
40
+ _configured_candidates = [
41
+ value.strip()
42
+ for value in os.environ.get("GEMINI_MODEL_CANDIDATES", "").split(",")
43
+ if value.strip()
44
+ ]
45
+ MODEL_CANDIDATES = tuple(_configured_candidates) if _configured_candidates else DEFAULT_MODEL_CANDIDATES
46
+
47
  REQUEST_TIMEOUT_S = float(os.environ.get("GEMINI_REQUEST_TIMEOUT_S", "10"))
48
  MAX_MODEL_ATTEMPTS = max(1, int(os.environ.get("GEMINI_MAX_MODEL_ATTEMPTS", "3")))
49
  ENABLE_LEGACY_MODEL_DISCOVERY = os.environ.get("GEMINI_DISCOVER_MODELS", "").strip().lower() in {
src/services/media_utils.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import subprocess
5
+ import tempfile
6
+ import wave
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def extract_video_frames(video_path: str | Path, max_frames: int = 300) -> list[np.ndarray]:
    """Decode up to ``max_frames`` evenly spaced RGB frames from a video file.

    Args:
        video_path: Path to a video container readable by OpenCV.
        max_frames: Hard cap on the number of frames returned.

    Returns:
        A list of ``np.ndarray`` frames in RGB channel order (OpenCV decodes
        BGR, so each kept frame is converted). Empty when the file cannot be
        opened or contains no decodable frames.

    Raises:
        RuntimeError: If OpenCV cannot be imported in this environment.
    """
    try:
        import cv2  # type: ignore
    except Exception as exc:
        raise RuntimeError(f"OpenCV unavailable: {exc}") from exc

    path = str(Path(video_path))
    cap = cv2.VideoCapture(path)
    # Fail soft on missing/unreadable files: callers treat [] as "no frames".
    if not cap.isOpened():
        cap.release()
        return []

    # CAP_PROP_FRAME_COUNT may be 0 or negative for some streams; fall back
    # to sampling every frame (the max_frames cap below still bounds output).
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step = max(1, total // max_frames) if total > 0 else 1

    frames: list[np.ndarray] = []
    try:
        index = 0
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            if index % step == 0:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            index += 1
            if len(frames) >= max_frames:
                break
    finally:
        # Always release the capture handle, even if decoding/conversion raises.
        cap.release()
    return frames
39
+
40
+
41
def extract_audio_waveform(
    video_path: str | Path,
    sample_rate: int = 16000,
) -> tuple[np.ndarray, int] | None:
    """Extract a mono float32 waveform from a media file via ffmpeg.

    Args:
        video_path: Source media file (video or audio container).
        sample_rate: Target sample rate passed to ffmpeg's resampler.

    Returns:
        ``(samples, rate)`` where ``samples`` is a 1-D float32 array in
        ``[-1.0, 1.0]`` and ``rate`` is the rate reported by the decoded WAV,
        or ``None`` when the file is missing, ffmpeg is unavailable or fails,
        or the stream contains no usable audio.
    """
    path = Path(video_path)
    if not path.exists():
        return None

    # Reserve a temporary .wav path that ffmpeg will overwrite (-y below).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = Path(tmp.name)

    cmd = [
        "ffmpeg",
        "-nostdin",
        "-loglevel",
        "error",
        "-y",
        "-i",
        str(path),
        "-vn",  # drop the video stream
        "-ac",
        "1",  # downmix to mono
        "-ar",
        str(sample_rate),
        str(wav_path),
    ]

    try:
        # Broad except covers both a missing ffmpeg binary and conversion errors.
        subprocess.run(cmd, check=True, capture_output=True)
    except Exception as exc:
        logger.warning("Audio extraction failed via ffmpeg: %s", exc)
        wav_path.unlink(missing_ok=True)
        return None

    try:
        with wave.open(str(wav_path), "rb") as wav_file:
            channels = wav_file.getnchannels()
            sr = wav_file.getframerate()
            sampwidth = wav_file.getsampwidth()
            pcm = wav_file.readframes(wav_file.getnframes())
    except Exception as exc:
        logger.warning("Could not read extracted WAV file: %s", exc)
        return None
    finally:
        # The finally clause removes the temp file on success *and* failure,
        # so the except branch above must not unlink it a second time.
        wav_path.unlink(missing_ok=True)

    if not pcm:
        return None

    # Normalise PCM to float32 in [-1, 1]. ffmpeg defaults to pcm_s16le here,
    # but tolerate 8-bit unsigned and 32-bit signed widths as well.
    if sampwidth == 1:
        arr = np.frombuffer(pcm, dtype=np.uint8).astype(np.float32)
        arr = (arr - 128.0) / 128.0
    elif sampwidth == 2:
        arr = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
    elif sampwidth == 4:
        arr = np.frombuffer(pcm, dtype=np.int32).astype(np.float32) / 2147483648.0
    else:
        logger.warning("Unsupported audio sample width: %s", sampwidth)
        return None

    # Defensive: -ac 1 should already yield mono, but downmix if it did not.
    if channels > 1:
        arr = arr.reshape(-1, channels).mean(axis=1)

    arr = np.clip(arr, -1.0, 1.0).astype(np.float32)
    if arr.size == 0:
        return None
    return arr, sr