Spaces:

akagtag
/

deepdetection

Paused

App Files Files Community

akagtag committed 24 days ago

Commit

337b262

1 Parent(s): 19d9b40

Fix ZeroGPU model loading lifecycle

Browse files

Files changed (5) hide show

requirements.txt +1 -0
src/api/main.py +12 -0
src/engines/coherence/engine.py +117 -70
src/engines/fingerprint/engine.py +163 -138
src/engines/sstgnn/engine.py +115 -104

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 gradio==5.23.0
 fastapi>=0.111.0
 uvicorn[standard]>=0.29.0
 python-multipart>=0.0.9

 gradio==5.23.0
+spaces>=0.30.2
 fastapi>=0.111.0
 uvicorn[standard]>=0.29.0
 python-multipart>=0.0.9

src/api/main.py CHANGED Viewed

@@ -69,6 +69,10 @@ if _is_test_mode():
     os.environ.setdefault("GENAI_SKIP_MODEL_LOAD", "1")
 app = FastAPI(title="GenAI-DeepDetect", version="1.0.0")
 app.add_middleware(
     CORSMiddleware,
@@ -294,6 +298,14 @@ async def preload() -> None:
         logger.info("Skipping startup preload in test mode")
         return
     logger.info("Preloading models...")
     # Keep model imports/loads sequential to avoid lazy-import race issues.
     await asyncio.to_thread(_fp._ensure)

     os.environ.setdefault("GENAI_SKIP_MODEL_LOAD", "1")
+def _is_zero_gpu_space() -> bool:  # True when the SPACE_ID environment variable starts with "akagtag/"; unset counts as False.
+    return os.environ.get("SPACE_ID", "").startswith("akagtag/")  # NOTE(review): matches any Space under this owner namespace, not ZeroGPU hardware specifically - confirm intended
 app = FastAPI(title="GenAI-DeepDetect", version="1.0.0")
 app.add_middleware(
     CORSMiddleware,
         logger.info("Skipping startup preload in test mode")
         return
+    if _is_zero_gpu_space():
+        logger.info("Skipping startup preload on ZeroGPU; local models load inside @spaces.GPU calls")
+        return
+    if get_inference_backend() in {"hf", "runpod"}:
+        logger.info("Skipping startup preload for remote inference backend")
+        return
     logger.info("Preloading models...")
     # Keep model imports/loads sequential to avoid lazy-import race issues.
     await asyncio.to_thread(_fp._ensure)

src/engines/coherence/engine.py CHANGED Viewed

@@ -13,11 +13,6 @@ from typing import Optional
 import numpy as np
 from PIL import Image
-try:
-    import spaces  # type: ignore  # noqa: F401
-except ImportError:
-    spaces = None
 from src.types import EngineResult
 logger = logging.getLogger(__name__)
@@ -28,15 +23,11 @@ _mtcnn = None
 _resnet = None
 _face_mesh = None
 _torch = None
-_device = "cpu"  # updated to "cuda" in _load() when GPU is available
 _resnet_fallback = None   # torchvision ResNet-18 used when facenet-pytorch unavailable
 _transform_fallback = None
-def _prefer_cuda(torch_module) -> bool:
-    return torch_module.cuda.is_available() or os.environ.get("SPACE_ID", "").startswith("akagtag/")
 def _skip_model_loads() -> bool:
     return os.environ.get("GENAI_SKIP_MODEL_LOAD", "").strip().lower() in {
         "1",
@@ -139,7 +130,7 @@ def _load() -> None:
         import torch  # type: ignore
         _torch = torch
-        _device = "cuda" if _prefer_cuda(torch) else "cpu"
         logger.info("  Coherence device: %s", _device)
         from facenet_pytorch import InceptionResnetV1, MTCNN  # type: ignore
@@ -159,7 +150,7 @@ def _load() -> None:
             import torchvision.transforms as tv_transforms  # type: ignore
             _torch = torch
-            _device = "cuda" if _prefer_cuda(torch) else "cpu"
             model = tv_models.resnet18(weights=tv_models.ResNet18_Weights.DEFAULT)
             model.fc = torch.nn.Identity()  # strip classifier → 512-d embedding
@@ -183,6 +174,47 @@ def _load() -> None:
     logger.info("Coherence model load attempt complete")
 class CoherenceEngine:
     def _ensure(self) -> None:
         with _lock:
@@ -191,18 +223,22 @@ class CoherenceEngine:
     def run(self, image: Image.Image) -> EngineResult:
         t0 = time.perf_counter()
         self._ensure()
-        frame = np.array(image.convert("RGB"))
-        score = self._image_score(frame)
-        return EngineResult(
-            engine="coherence",
-            verdict="FAKE" if score > 0.5 else "REAL",
-            confidence=float(np.clip(score, 0.0, 1.0)),
-            attributed_generator=None,
-            explanation=f"Geometric coherence anomaly {score:.2f} (image mode).",
-            processing_time_ms=(time.perf_counter() - t0) * 1000,
-        )
     def _image_score(self, frame: np.ndarray) -> float:
         if _face_mesh is None:
@@ -252,58 +288,69 @@ class CoherenceEngine:
         """
         t0 = time.perf_counter()
         self._ensure()
-        if not frames:
             return EngineResult(
                 engine="coherence",
-                verdict="UNKNOWN",
-                confidence=0.5,
                 attributed_generator=None,
-                explanation="No frames.",
-                processing_time_ms=0.0,
             )
-        if len(frames) < 4:
-            result = self.run(Image.fromarray(frames[0]))
-            result.explanation = "Too few frames for temporal analysis."
-            return result
-        delta = self._embedding_variance(frames)
-        jerk = self._landmark_jerk(frames)
-        blink = self._blink_anomaly(frames)
-        visual_score = float(np.clip(delta * 0.45 + jerk * 0.35 + blink * 0.20, 0.0, 1.0))
-        audio_anomaly: Optional[float] = None
-        timestamp_markers: list[dict] = []
-        if video_path is not None:
-            audio_anomaly, timestamp_markers = self._audio_lipsync_score(video_path, frames)
-        if audio_anomaly is not None:
-            score = float(np.clip(visual_score * 0.60 + audio_anomaly * 0.40, 0.0, 1.0))
-            explanation = (
-                f"Embedding variance {delta:.2f}, landmark jerk {jerk:.2f}, "
-                f"blink anomaly {blink:.2f}. "
-                f"Audio lip-sync anomaly {audio_anomaly:.2f} "
-                f"({len(timestamp_markers)} flagged segment(s))."
-            )
-        else:
-            score = visual_score
-            explanation = (
-                f"Embedding variance {delta:.2f}, "
-                f"landmark jerk {jerk:.2f}, "
-                f"blink anomaly {blink:.2f}."
-            )
-        return EngineResult(
-            engine="coherence",
-            verdict="FAKE" if score > 0.5 else "REAL",
-            confidence=score,
-            attributed_generator=None,
-            explanation=explanation,
-            processing_time_ms=(time.perf_counter() - t0) * 1000,
-            audio_sync_score=audio_anomaly,
-            timestamp_markers=timestamp_markers,
-        )
     def _audio_lipsync_score(
         self,

 import numpy as np
 from PIL import Image
 from src.types import EngineResult
 logger = logging.getLogger(__name__)
 _resnet = None
 _face_mesh = None
 _torch = None
+_device = "cpu"
 _resnet_fallback = None   # torchvision ResNet-18 used when facenet-pytorch unavailable
 _transform_fallback = None
 def _skip_model_loads() -> bool:
     return os.environ.get("GENAI_SKIP_MODEL_LOAD", "").strip().lower() in {
         "1",
         import torch  # type: ignore
         _torch = torch
+        _device = "cpu"
         logger.info("  Coherence device: %s", _device)
         from facenet_pytorch import InceptionResnetV1, MTCNN  # type: ignore
             import torchvision.transforms as tv_transforms  # type: ignore
             _torch = torch
+            _device = "cpu"
             model = tv_models.resnet18(weights=tv_models.ResNet18_Weights.DEFAULT)
             model.fc = torch.nn.Identity()  # strip classifier → 512-d embedding
     logger.info("Coherence model load attempt complete")
+def _inference_device() -> str:  # Resolve the device string for the current call: "cuda" or "cpu".
+    if _torch is None:  # no torch module reference has been recorded in this module
+        return "cpu"
+    try:
+        return "cuda" if _torch.cuda.is_available() else "cpu"  # probed on every call rather than cached
+    except Exception:  # any failure while probing CUDA falls back to CPU
+        return "cpu"
+def _prepare_runtime(device: str) -> None:  # Record the active device and, for "cuda", move the embedding models onto it.
+    global _device
+    _device = device  # module-level state, updated even when device is "cpu"
+    if device != "cuda":  # nothing to move for CPU inference
+        return
+    if _resnet is not None:
+        _resnet.to(device)  # NOTE(review): not wrapped in try/except, unlike the moves in _release_runtime
+    if _resnet_fallback is not None:
+        _resnet_fallback.to(device)
+def _release_runtime(device: str) -> None:  # Undo _prepare_runtime: reset the device marker and move models back to CPU.
+    global _device
+    _device = "cpu"  # reset unconditionally, whatever device was used
+    if device != "cuda" or _torch is None:  # models are only moved for "cuda", so nothing else to undo
+        return
+    if _resnet is not None:
+        try:
+            _resnet.to("cpu")
+        except Exception:  # best-effort cleanup: a failed move is ignored silently
+            pass
+    if _resnet_fallback is not None:
+        try:
+            _resnet_fallback.to("cpu")
+        except Exception:  # best-effort cleanup: a failed move is ignored silently
+            pass
+    try:
+        _torch.cuda.empty_cache()  # attempted after the models have been moved back to CPU
+    except Exception:  # cache release failures are ignored as well
+        pass
 class CoherenceEngine:
     def _ensure(self) -> None:
         with _lock:
     def run(self, image: Image.Image) -> EngineResult:  # Single-image entry point: builds an EngineResult from self._image_score().
         t0 = time.perf_counter()
         self._ensure()
+        device = _inference_device()  # resolved per call, after self._ensure()
+        _prepare_runtime(device)  # NOTE(review): runs before the try, so an exception here skips _release_runtime
+        try:
+            frame = np.array(image.convert("RGB"))
+            score = self._image_score(frame)
+            return EngineResult(
+                engine="coherence",
+                verdict="FAKE" if score > 0.5 else "REAL",  # strictly above 0.5 is reported as FAKE
+                confidence=float(np.clip(score, 0.0, 1.0)),  # clamped to [0, 1]; the verdict uses the unclamped score
+                attributed_generator=None,
+                explanation=f"Geometric coherence anomaly {score:.2f} (image mode).",
+                processing_time_ms=(time.perf_counter() - t0) * 1000,  # elapsed time since t0, in milliseconds
+            )
+        finally:
+            _release_runtime(device)  # runs on both the return path and on exceptions raised inside the try
     def _image_score(self, frame: np.ndarray) -> float:
         if _face_mesh is None:
         """
         t0 = time.perf_counter()
         self._ensure()
+        device = _inference_device()
+        _prepare_runtime(device)
+        try:
+            if not frames:
+                return EngineResult(
+                    engine="coherence",
+                    verdict="UNKNOWN",
+                    confidence=0.5,
+                    attributed_generator=None,
+                    explanation="No frames.",
+                    processing_time_ms=0.0,
+                )
+            if len(frames) < 4:
+                score = self._image_score(frames[0])
+                return EngineResult(
+                    engine="coherence",
+                    verdict="FAKE" if score > 0.5 else "REAL",
+                    confidence=float(np.clip(score, 0.0, 1.0)),
+                    attributed_generator=None,
+                    explanation="Too few frames for temporal analysis.",
+                    processing_time_ms=(time.perf_counter() - t0) * 1000,
+                )
+            delta = self._embedding_variance(frames)
+            jerk = self._landmark_jerk(frames)
+            blink = self._blink_anomaly(frames)
+            visual_score = float(np.clip(delta * 0.45 + jerk * 0.35 + blink * 0.20, 0.0, 1.0))
+            audio_anomaly: Optional[float] = None
+            timestamp_markers: list[dict] = []
+            if video_path is not None:
+                audio_anomaly, timestamp_markers = self._audio_lipsync_score(video_path, frames)
+            if audio_anomaly is not None:
+                score = float(np.clip(visual_score * 0.60 + audio_anomaly * 0.40, 0.0, 1.0))
+                explanation = (
+                    f"Embedding variance {delta:.2f}, landmark jerk {jerk:.2f}, "
+                    f"blink anomaly {blink:.2f}. "
+                    f"Audio lip-sync anomaly {audio_anomaly:.2f} "
+                    f"({len(timestamp_markers)} flagged segment(s))."
+                )
+            else:
+                score = visual_score
+                explanation = (
+                    f"Embedding variance {delta:.2f}, "
+                    f"landmark jerk {jerk:.2f}, "
+                    f"blink anomaly {blink:.2f}."
+                )
             return EngineResult(
                 engine="coherence",
+                verdict="FAKE" if score > 0.5 else "REAL",
+                confidence=score,
                 attributed_generator=None,
+                explanation=explanation,
+                processing_time_ms=(time.perf_counter() - t0) * 1000,
+                audio_sync_score=audio_anomaly,
+                timestamp_markers=timestamp_markers,
             )
+        finally:
+            _release_runtime(device)
     def _audio_lipsync_score(
         self,

src/engines/fingerprint/engine.py CHANGED Viewed

@@ -17,24 +17,11 @@ import torch
 from PIL import Image
 from transformers import CLIPModel, CLIPProcessor
-try:
-    import spaces  # type: ignore  # noqa: F401
-except ImportError:
-    spaces = None
 from src.types import EngineResult
 logger = logging.getLogger(__name__)
 CACHE = os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
-def _prefer_cuda() -> bool:
-    return torch.cuda.is_available() or os.environ.get("SPACE_ID", "").startswith("akagtag/")
-# GPU device selection — ZeroGPU emulates CUDA outside the decorated section.
-_DEVICE = "cuda" if _prefer_cuda() else "cpu"
-_PIPELINE_DEVICE = 0 if _DEVICE == "cuda" else -1  # HF pipeline convention
 DETECTOR_CANDIDATES = [
     "Organika/sdxl-detector",
     "haywoodsloan/ai-image-detector-deploy",
@@ -43,14 +30,14 @@ DETECTOR_CANDIDATES = [
 ]
 GENERATOR_PROMPTS: dict[str, str] = {
-    "real":               "photograph with natural film grain, uneven organic noise, authentic lens distortion, and real-world lighting imperfections",
-    "sora":               "AI video frame with unnaturally smooth temporal transitions, photorealistic but physically implausible motion, and over-consistent lighting",
-    "runway":             "AI video frame with painterly color grading artifacts, dreamlike motion blur inconsistencies, and synthetic depth-of-field",
-    "wav2lip":            "face with sharp unnatural lip boundary artifacts, texture discontinuity around the mouth region, and mismatched skin tone at lip edges",
-    "stable_diffusion":   "image with soft overly-smooth skin, color bleeding at object edges, dreamlike over-saturation, and repeating background texture patterns",
-    "sdxl":               "image with hyper-sharp commercial detail, perfect noise-free skin, unnaturally crisp edges, and over-rendered textures lacking real-world imperfection",
-    "midjourney":         "image with dramatic cinematic vignette, fantasy color palette, exaggerated contrast, hyper-detailed surreal aesthetic, and painterly over-rendering",
-    "dall_e":             "image with clean flat graphic style, smooth AI-blended gradients, slightly plastic surface quality, and uniformly lit commercial illustration look",
     "unknown_generative": "image with subtle AI artifacts including unnatural smoothness, inconsistent frequency patterns, and synthetic pixel-level regularities absent in real photos",
 }
@@ -103,10 +90,9 @@ def _short_error(exc: Exception, *, limit: int = 300) -> str:
 def _build_detector(model_id: str) -> Any:
     hf_pipeline = _get_pipeline()
-    # Try GPU first, fall back to CPU-only variants
-    attempts: tuple[dict, ...] = (
-        {"cache_dir": CACHE, "device": _PIPELINE_DEVICE},
-        {"device": _PIPELINE_DEVICE},
         {"cache_dir": CACHE},
         {},
     )
@@ -126,12 +112,12 @@ def _load() -> None:
     if _loaded:
         return
-    logger.info("Fingerprint engine: loading models on device=%s ...", _DEVICE)
     for model_id in DETECTOR_CANDIDATES:
         try:
-            det = _build_detector(model_id)
-            _detectors.append((model_id, det))
             logger.info("  detector loaded: %s", model_id)
         except Exception as exc:
             logger.warning("  detector unavailable (%s): %s", model_id, _short_error(exc))
@@ -140,31 +126,74 @@ def _load() -> None:
         logger.error("Fingerprint engine: no detectors loaded; using neutral fallback score.")
     try:
-        # Load CLIP in FP16 on CUDA for ~2× speed + half memory on A100
-        dtype = torch.float16 if _DEVICE == "cuda" else torch.float32
         _clip_model = CLIPModel.from_pretrained(
             "openai/clip-vit-large-patch14",
             cache_dir=CACHE,
-            torch_dtype=dtype,
-        ).to(_DEVICE)
         _clip_processor = CLIPProcessor.from_pretrained(
             "openai/clip-vit-large-patch14",
             cache_dir=CACHE,
         )
         _clip_model.eval()
-        logger.info("  CLIP loaded on %s (dtype=%s)", _DEVICE, dtype)
     except Exception as exc:
         logger.warning("  CLIP unavailable: %s", _short_error(exc))
     _loaded = True
     logger.info(
-        "Fingerprint engine ready: %s detectors, CLIP=%s, device=%s",
         len(_detectors),
         "ok" if _clip_model else "missing",
-        _DEVICE,
     )
 def _fake_score_from_preds(preds: list[dict[str, Any]]) -> float:
     if not preds:
         return 0.5
@@ -197,45 +226,48 @@ class FingerprintEngine:
     def run(self, image: Image.Image) -> EngineResult:
         t0 = time.perf_counter()
         self._ensure()
-        if image.mode != "RGB":
-            image = image.convert("RGB")
-        detector_weights = [0.4, 0.3, 0.2, 0.1]
-        total_w = 0.0
-        weighted_fake = 0.0
-        for index, (model_id, det) in enumerate(_detectors):
-            try:
-                preds = det(image)
-                score = _fake_score_from_preds(preds)
-                weight = detector_weights[index] if index < len(detector_weights) else 0.1
-                weighted_fake += score * weight
-                total_w += weight
-                logger.debug("%s fake_score=%.3f", model_id, score)
-            except Exception as exc:
-                logger.warning("Detector %s inference error: %s", model_id, _short_error(exc))
-        ensemble_score = (weighted_fake / total_w) if total_w > 0 else 0.5
-        dct_score = self._dct_frequency_score(image)
-        fake_score = float(np.clip(ensemble_score * 0.85 + dct_score * 0.15, 0.0, 1.0))
-        generator = self._attribute_generator(image, fake_score)
-        return EngineResult(
-            engine="fingerprint",
-            verdict="FAKE" if fake_score > 0.5 else "REAL",
-            confidence=float(fake_score),
-            attributed_generator=generator,
-            explanation=(
-                f"Ensemble {ensemble_score:.2f} × 0.85 + DCT {dct_score:.2f} × 0.15 = {fake_score:.2f}. "
-                f"Generator attributed to: {generator}."
-            ),
-            processing_time_ms=(time.perf_counter() - t0) * 1000,
-        )
-    def _attribute_generator(self, image: Image.Image, fake_score: float) -> str:
         if _clip_model is None or _clip_processor is None:
             _thread_local.last_clip_embedding = None
             return "unknown_generative" if fake_score > 0.5 else "real"
@@ -250,11 +282,10 @@ class FingerprintEngine:
                 truncation=True,
                 max_length=77,
             )
-            # Move all tensors to GPU
-            inputs = {k: v.to(_DEVICE) for k, v in inputs.items()}
             with torch.no_grad():
-                with torch.cuda.amp.autocast(enabled=(_DEVICE == "cuda")):
                     outputs = _clip_model(**inputs)
                     logits = outputs.logits_per_image[0].float()
                     image_embeds = outputs.image_embeds.detach().float().cpu().numpy()[0]
@@ -279,16 +310,13 @@ class FingerprintEngine:
             return "unknown_generative" if fake_score > 0.5 else "real"
     def _batch_clip_attribution(
-        self, images: list[Image.Image], fake_scores: list[float]
     ) -> list[str]:
-        """
-        Single batched CLIP forward pass for all keyframes — far faster than
-        calling _attribute_generator() once per frame on GPU.
-        """
         if _clip_model is None or _clip_processor is None or not images:
-            return [
-                "unknown_generative" if s > 0.5 else "real" for s in fake_scores
-            ]
         try:
             texts = list(GENERATOR_PROMPTS.values())
@@ -300,14 +328,13 @@ class FingerprintEngine:
                 truncation=True,
                 max_length=77,
             )
-            inputs = {k: v.to(_DEVICE) for k, v in inputs.items()}
             with torch.no_grad():
-                with torch.cuda.amp.autocast(enabled=(_DEVICE == "cuda")):
-                    # logits_per_image: (N_images, N_texts)
                     logits = _clip_model(**inputs).logits_per_image.float()
-            probs_batch = logits.softmax(dim=-1).cpu().numpy()  # (N, 9)
             keys = list(GENERATOR_PROMPTS.keys())
             results: list[str] = []
@@ -315,24 +342,22 @@ class FingerprintEngine:
                 probs = probs_batch[i]
                 max_prob = float(np.max(probs))
                 if max_prob < 0.32:
-                    gen = "unknown_generative"
                 else:
-                    gen = keys[int(np.argmax(probs))]
-                if fake_score > 0.65 and gen == "real":
-                    gen = "unknown_generative"
-                if fake_score < 0.35 and gen != "real":
-                    gen = "real"
-                results.append(gen)
             return results
         except Exception as exc:
             logger.warning("Batch CLIP attribution error: %s", _short_error(exc))
-            return [
-                "unknown_generative" if s > 0.5 else "real" for s in fake_scores
-            ]
     def _dct_frequency_score(self, image: Image.Image) -> float:
-        """DCT frequency band analysis (paper §III-B). Runs on CPU (block-level)."""
         try:
             from scipy.fft import dctn  # type: ignore
@@ -363,7 +388,6 @@ class FingerprintEngine:
             return 0.3
     def get_last_clip_embedding(self) -> Optional[np.ndarray]:
-        """Return the CLIP image embedding from the most recent run() call in this thread."""
         return getattr(_thread_local, "last_clip_embedding", None)
     def run_video(self, frames: list) -> EngineResult:
@@ -378,42 +402,43 @@ class FingerprintEngine:
             )
         self._ensure()
-        keyframes = frames[::8] or [frames[0]]
-        keyframes_pil = [
-            Image.fromarray(f).convert("RGB") for f in keyframes
-        ]
-        # Batch detector scores (HF pipeline accepts a list)
-        detector_weights = [0.4, 0.3, 0.2, 0.1]
-        frame_scores: list[float] = []
-        for img in keyframes_pil:
-            total_w = 0.0
-            weighted_fake = 0.0
-            for index, (model_id, det) in enumerate(_detectors):
-                try:
-                    preds = det(img)
-                    score = _fake_score_from_preds(preds)
-                    weight = detector_weights[index] if index < len(detector_weights) else 0.1
-                    weighted_fake += score * weight
-                    total_w += weight
-                except Exception:
-                    pass
-            frame_scores.append((weighted_fake / total_w) if total_w > 0 else 0.5)
-        # Single batched CLIP pass for all keyframes
-        generators = self._batch_clip_attribution(keyframes_pil, frame_scores)
-        avg_conf = float(np.mean(frame_scores))
-        top_gen = max(set(generators), key=generators.count) if generators else "unknown_generative"
-        return EngineResult(
-            engine="fingerprint",
-            verdict="FAKE" if avg_conf > 0.5 else "REAL",
-            confidence=avg_conf,
-            attributed_generator=top_gen,
-            explanation=(
-                f"Keyframe average fake score {avg_conf:.2f} over {len(keyframes)} sampled frames. "
-                f"Dominant generator: {top_gen}."
-            ),
-            processing_time_ms=(time.perf_counter() - t0) * 1000,
-        )

 from PIL import Image
 from transformers import CLIPModel, CLIPProcessor
 from src.types import EngineResult
 logger = logging.getLogger(__name__)
 CACHE = os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
 DETECTOR_CANDIDATES = [
     "Organika/sdxl-detector",
     "haywoodsloan/ai-image-detector-deploy",
 ]
 GENERATOR_PROMPTS: dict[str, str] = {
+    "real": "photograph with natural film grain, uneven organic noise, authentic lens distortion, and real-world lighting imperfections",
+    "sora": "AI video frame with unnaturally smooth temporal transitions, photorealistic but physically implausible motion, and over-consistent lighting",
+    "runway": "AI video frame with painterly color grading artifacts, dreamlike motion blur inconsistencies, and synthetic depth-of-field",
+    "wav2lip": "face with sharp unnatural lip boundary artifacts, texture discontinuity around the mouth region, and mismatched skin tone at lip edges",
+    "stable_diffusion": "image with soft overly-smooth skin, color bleeding at object edges, dreamlike over-saturation, and repeating background texture patterns",
+    "sdxl": "image with hyper-sharp commercial detail, perfect noise-free skin, unnaturally crisp edges, and over-rendered textures lacking real-world imperfection",
+    "midjourney": "image with dramatic cinematic vignette, fantasy color palette, exaggerated contrast, hyper-detailed surreal aesthetic, and painterly over-rendering",
+    "dall_e": "image with clean flat graphic style, smooth AI-blended gradients, slightly plastic surface quality, and uniformly lit commercial illustration look",
     "unknown_generative": "image with subtle AI artifacts including unnatural smoothness, inconsistent frequency patterns, and synthetic pixel-level regularities absent in real photos",
 }
 def _build_detector(model_id: str) -> Any:
     hf_pipeline = _get_pipeline()
+    attempts: tuple[dict[str, Any], ...] = (
+        {"cache_dir": CACHE, "device": -1},
+        {"device": -1},
         {"cache_dir": CACHE},
         {},
     )
     if _loaded:
         return
+    logger.info("Fingerprint engine: loading models on CPU ...")
     for model_id in DETECTOR_CANDIDATES:
         try:
+            detector = _build_detector(model_id)
+            _detectors.append((model_id, detector))
             logger.info("  detector loaded: %s", model_id)
         except Exception as exc:
             logger.warning("  detector unavailable (%s): %s", model_id, _short_error(exc))
         logger.error("Fingerprint engine: no detectors loaded; using neutral fallback score.")
     try:
         _clip_model = CLIPModel.from_pretrained(
             "openai/clip-vit-large-patch14",
             cache_dir=CACHE,
+            torch_dtype=torch.float32,
+        ).to("cpu")
         _clip_processor = CLIPProcessor.from_pretrained(
             "openai/clip-vit-large-patch14",
             cache_dir=CACHE,
         )
         _clip_model.eval()
+        logger.info("  CLIP loaded on cpu")
     except Exception as exc:
         logger.warning("  CLIP unavailable: %s", _short_error(exc))
     _loaded = True
     logger.info(
+        "Fingerprint engine ready: %s detectors, CLIP=%s",
         len(_detectors),
         "ok" if _clip_model else "missing",
     )
+def _inference_device() -> str:  # Resolve the device string for the current call: "cuda" or "cpu".
+    try:
+        return "cuda" if torch.cuda.is_available() else "cpu"  # probed on every call rather than cached
+    except Exception:  # any failure while probing CUDA falls back to CPU
+        return "cpu"
+def _move_detector(detector: Any, device: str) -> None:  # Move a detector's underlying model to `device` and update its recorded device.
+    model = getattr(detector, "model", None)  # tolerate detector objects that expose no .model attribute
+    if model is not None and hasattr(model, "to"):
+        model.to(device)
+    if hasattr(detector, "device"):
+        detector.device = torch.device(device)  # presumably keeps the wrapper's input placement in sync with the model - verify against the pipeline implementation
+def _prepare_runtime(device: str) -> None:  # For "cuda", move every loaded detector and the CLIP model onto the GPU.
+    if device != "cuda":  # CPU inference needs no preparation
+        return
+    for _, detector in _detectors:
+        try:
+            _move_detector(detector, device)
+        except Exception as exc:  # a failed move is logged and the loop continues with the next detector
+            logger.warning("Fingerprint detector GPU move failed: %s", _short_error(exc))
+    if _clip_model is not None:
+        _clip_model.to(device)  # NOTE(review): not guarded, unlike the detector moves above
+def _release_runtime(device: str) -> None:  # Undo _prepare_runtime: move detectors and CLIP back to CPU.
+    if device != "cuda":  # models are only moved for "cuda", so nothing to undo
+        return
+    for _, detector in _detectors:
+        try:
+            _move_detector(detector, "cpu")
+        except Exception:  # best-effort cleanup: failures are ignored without logging
+            pass
+    if _clip_model is not None:
+        try:
+            _clip_model.to("cpu")
+        except Exception:  # best-effort cleanup: failures are ignored without logging
+            pass
+    try:
+        torch.cuda.empty_cache()  # attempted after the models have been moved back to CPU
+    except Exception:  # cache release failures are ignored as well
+        pass
 def _fake_score_from_preds(preds: list[dict[str, Any]]) -> float:
     if not preds:
         return 0.5
     def run(self, image: Image.Image) -> EngineResult:  # Single-image entry point: detector ensemble + DCT score + generator attribution.
         t0 = time.perf_counter()
         self._ensure()
+        device = _inference_device()  # resolved per call, after self._ensure()
+        _prepare_runtime(device)  # NOTE(review): runs before the try, so an exception here skips _release_runtime
+        try:
+            if image.mode != "RGB":
+                image = image.convert("RGB")
+            detector_weights = [0.4, 0.3, 0.2, 0.1]  # weight by position in _detectors
+            total_w = 0.0
+            weighted_fake = 0.0
+            for index, (model_id, detector) in enumerate(_detectors):
+                try:
+                    preds = detector(image)
+                    score = _fake_score_from_preds(preds)
+                    weight = detector_weights[index] if index < len(detector_weights) else 0.1  # detectors past the fourth get weight 0.1
+                    weighted_fake += score * weight
+                    total_w += weight  # only detectors that produced a score count toward the normaliser
+                    logger.debug("%s fake_score=%.3f", model_id, score)
+                except Exception as exc:  # a failing detector is logged and left out of the ensemble
+                    logger.warning("Detector %s inference error: %s", model_id, _short_error(exc))
+            ensemble_score = (weighted_fake / total_w) if total_w > 0 else 0.5  # neutral 0.5 when no detector contributed
+            dct_score = self._dct_frequency_score(image)
+            fake_score = float(np.clip(ensemble_score * 0.85 + dct_score * 0.15, 0.0, 1.0))  # 85% ensemble + 15% DCT, clamped to [0, 1]
+            generator = self._attribute_generator(image, fake_score, device)  # device is passed through explicitly
+            return EngineResult(
+                engine="fingerprint",
+                verdict="FAKE" if fake_score > 0.5 else "REAL",  # strictly above 0.5 is reported as FAKE
+                confidence=float(fake_score),
+                attributed_generator=generator,
+                explanation=(
+                    f"Ensemble {ensemble_score:.2f} x 0.85 + DCT {dct_score:.2f} x 0.15 = {fake_score:.2f}. "
+                    f"Generator attributed to: {generator}."
+                ),
+                processing_time_ms=(time.perf_counter() - t0) * 1000,  # elapsed time since t0, in milliseconds
+            )
+        finally:
+            _release_runtime(device)  # runs on both the return path and on exceptions raised inside the try
+    def _attribute_generator(self, image: Image.Image, fake_score: float, device: str) -> str:
         if _clip_model is None or _clip_processor is None:
             _thread_local.last_clip_embedding = None
             return "unknown_generative" if fake_score > 0.5 else "real"
                 truncation=True,
                 max_length=77,
             )
+            inputs = {k: v.to(device) for k, v in inputs.items()}
             with torch.no_grad():
+                with torch.cuda.amp.autocast(enabled=(device == "cuda")):
                     outputs = _clip_model(**inputs)
                     logits = outputs.logits_per_image[0].float()
                     image_embeds = outputs.image_embeds.detach().float().cpu().numpy()[0]
             return "unknown_generative" if fake_score > 0.5 else "real"
     def _batch_clip_attribution(
+        self,
+        images: list[Image.Image],
+        fake_scores: list[float],
+        device: str,
     ) -> list[str]:
         if _clip_model is None or _clip_processor is None or not images:
+            return ["unknown_generative" if s > 0.5 else "real" for s in fake_scores]
         try:
             texts = list(GENERATOR_PROMPTS.values())
                 truncation=True,
                 max_length=77,
             )
+            inputs = {k: v.to(device) for k, v in inputs.items()}
             with torch.no_grad():
+                with torch.cuda.amp.autocast(enabled=(device == "cuda")):
                     logits = _clip_model(**inputs).logits_per_image.float()
+            probs_batch = logits.softmax(dim=-1).cpu().numpy()
             keys = list(GENERATOR_PROMPTS.keys())
             results: list[str] = []
                 probs = probs_batch[i]
                 max_prob = float(np.max(probs))
                 if max_prob < 0.32:
+                    generator = "unknown_generative"
                 else:
+                    generator = keys[int(np.argmax(probs))]
+                if fake_score > 0.65 and generator == "real":
+                    generator = "unknown_generative"
+                if fake_score < 0.35 and generator != "real":
+                    generator = "real"
+                results.append(generator)
             return results
         except Exception as exc:
             logger.warning("Batch CLIP attribution error: %s", _short_error(exc))
+            return ["unknown_generative" if s > 0.5 else "real" for s in fake_scores]
     def _dct_frequency_score(self, image: Image.Image) -> float:
+        """DCT frequency band analysis (paper section III-B). Runs on CPU."""
         try:
             from scipy.fft import dctn  # type: ignore
             return 0.3
     def get_last_clip_embedding(self) -> Optional[np.ndarray]:
         return getattr(_thread_local, "last_clip_embedding", None)
     def run_video(self, frames: list) -> EngineResult:
             )
         self._ensure()
+        device = _inference_device()
+        _prepare_runtime(device)
+        try:
+            keyframes = frames[::8] or [frames[0]]
+            keyframes_pil = [Image.fromarray(frame).convert("RGB") for frame in keyframes]
+            detector_weights = [0.4, 0.3, 0.2, 0.1]
+            frame_scores: list[float] = []
+            for image in keyframes_pil:
+                total_w = 0.0
+                weighted_fake = 0.0
+                for index, (_, detector) in enumerate(_detectors):
+                    try:
+                        preds = detector(image)
+                        score = _fake_score_from_preds(preds)
+                        weight = detector_weights[index] if index < len(detector_weights) else 0.1
+                        weighted_fake += score * weight
+                        total_w += weight
+                    except Exception:
+                        pass
+                frame_scores.append((weighted_fake / total_w) if total_w > 0 else 0.5)
+            generators = self._batch_clip_attribution(keyframes_pil, frame_scores, device)
+            avg_conf = float(np.mean(frame_scores))
+            top_gen = max(set(generators), key=generators.count) if generators else "unknown_generative"
+            return EngineResult(
+                engine="fingerprint",
+                verdict="FAKE" if avg_conf > 0.5 else "REAL",
+                confidence=avg_conf,
+                attributed_generator=top_gen,
+                explanation=(
+                    f"Keyframe average fake score {avg_conf:.2f} over {len(keyframes)} sampled frames. "
+                    f"Dominant generator: {top_gen}."
+                ),
+                processing_time_ms=(time.perf_counter() - t0) * 1000,
+            )
+        finally:
+            _release_runtime(device)

src/engines/sstgnn/engine.py CHANGED Viewed

@@ -12,24 +12,11 @@ import numpy as np
 import torch
 from PIL import Image
-try:
-    import spaces  # type: ignore  # noqa: F401
-except ImportError:
-    spaces = None
 from src.types import EngineResult
 logger = logging.getLogger(__name__)
 CACHE = os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
-def _prefer_cuda() -> bool:
-    return torch.cuda.is_available() or os.environ.get("SPACE_ID", "").startswith("akagtag/")
-# GPU device selection
-_DEVICE = "cuda" if _prefer_cuda() else "cpu"
-_PIPELINE_DEVICE = 0 if _DEVICE == "cuda" else -1  # HF pipeline convention
 _lock = threading.Lock()
 _load_attempted = False
 _detectors: list[Any] = []
@@ -53,6 +40,9 @@ _FAKE_LABEL_KEYWORDS = (
     "1",
 )
 def _skip_model_loads() -> bool:
     return os.environ.get("GENAI_SKIP_MODEL_LOAD", "").strip().lower() in {
@@ -80,10 +70,9 @@ def _short_error(exc: Exception, *, limit: int = 300) -> str:
 def _build_image_classifier(model_id: str) -> Any:
     pipeline = _get_pipeline()
-    # Try with GPU first, fall back gracefully
-    attempts: tuple[dict, ...] = (
-        {"cache_dir": CACHE, "device": _PIPELINE_DEVICE},
-        {"device": _PIPELINE_DEVICE},
         {"cache_dir": CACHE},
         {},
     )
@@ -119,10 +108,6 @@ def _fake_prob_from_preds(preds: list[dict[str, Any]]) -> float:
     return float(np.clip(fake_best, 0.0, 1.0))
-KEYPOINT_STEP = 7
-KEYPOINT_COUNT = 68
 class _TasksFaceMeshAdapter:
     def __init__(self, mp_module, landmarker) -> None:
         self._mp = mp_module
@@ -195,7 +180,7 @@ def _load() -> None:
         logger.info("Skipping SSTGNN model load (GENAI_SKIP_MODEL_LOAD=1)")
         return
-    logger.info("Loading SSTGNN models on device=%s ...", _DEVICE)
     try:
         configured_models = [
@@ -234,7 +219,46 @@ def _load() -> None:
     except Exception:
         _delaunay = None
-    logger.info("SSTGNN model load attempt complete (device=%s)", _DEVICE)
 class SSTGNNEngine:
@@ -245,27 +269,31 @@ class SSTGNNEngine:
     def run(self, image: Image.Image) -> EngineResult:
         t0 = time.perf_counter()
         self._ensure()
-        if image.mode != "RGB":
-            image = image.convert("RGB")
-        cnn = self._cnn_score(image)
-        graph = self._geometry_score(np.array(image))
-        if _detectors:
-            final = float(np.clip(cnn * 0.70 + graph * 0.30, 0.0, 1.0))
-            note = f"CNN ensemble {cnn:.2f}; geometric graph anomaly {graph:.2f}."
-        else:
-            final = float(np.clip(graph, 0.0, 1.0))
-            note = f"Geometric graph anomaly {graph:.2f} (cnn fallback unavailable)."
-        return EngineResult(
-            engine="sstgnn",
-            verdict="FAKE" if final > 0.5 else "REAL",
-            confidence=final,
-            attributed_generator=None,
-            explanation=note,
-            processing_time_ms=(time.perf_counter() - t0) * 1000,
-        )
     def _cnn_score(self, image: Image.Image) -> float:
         if not _detectors:
@@ -287,10 +315,6 @@ class SSTGNNEngine:
         return 0.5
     def _batch_cnn_scores(self, images: list[Image.Image]) -> list[float]:
-        """
-        Pass a batch of images through each detector at once — HF pipeline
-        accepts a list and handles batching internally on GPU.
-        """
         if not _detectors or not images:
             return [0.5] * len(images)
@@ -301,7 +325,6 @@ class SSTGNNEngine:
         for index, detector in enumerate(_detectors):
             weight = _detector_weights[index] if index < len(_detector_weights) else 1.0
             try:
-                # Pass the full list — GPU pipeline processes all frames in one batch
                 batch_preds = detector(images)
                 for i, preds in enumerate(batch_preds):
                     score = _fake_prob_from_preds(preds if isinstance(preds, list) else [preds])
@@ -346,16 +369,11 @@ class SSTGNNEngine:
             arr = np.array(areas, dtype=np.float32)
             cv_score = float(np.std(arr) / (np.mean(arr) + 1e-9))
             return float(np.clip((cv_score - 0.8) / 1.5, 0.0, 1.0))
         except Exception as exc:
             logger.warning("Geometry score error: %s", exc)
             return 0.3
     def _temporal_fft_score(self, frames: list[np.ndarray]) -> float:
-        """
-        Pixel-wise 1D FFT over the time axis (paper §III-C / Kim et al. [7]).
-        Uses torch.fft on GPU for ~10× speedup over numpy on A100.
-        """
         try:
             import cv2  # type: ignore
@@ -370,24 +388,23 @@ class SSTGNNEngine:
             gray_stack = np.array(
                 [
                     cv2.resize(
-                        cv2.cvtColor(f, cv2.COLOR_RGB2GRAY)
-                        if (f.ndim == 3 and f.shape[2] >= 3)
-                        else f[:, :, 0] if f.ndim == 3 else f,
                         (32, 32),
                     ).astype(np.float32)
-                    for f in sampled
                 ]
-            )  # shape: (T, 32, 32)
-            if _DEVICE == "cuda":
-                # GPU path: torch.fft on A100 is dramatically faster
-                gray_tensor = torch.from_numpy(gray_stack).to(_DEVICE)  # (T, 32, 32)
-                fft_result = torch.fft.rfft(gray_tensor, dim=0)          # (T//2+1, 32, 32)
                 power = torch.abs(fft_result) ** 2
                 dc_power = power[0].cpu().numpy()
                 total_power = (torch.sum(power, dim=0) + 1e-9).cpu().numpy()
             else:
-                # CPU fallback
                 fft_result = np.fft.rfft(gray_stack, axis=0)
                 power = np.abs(fft_result) ** 2
                 dc_power = power[0]
@@ -395,10 +412,7 @@ class SSTGNNEngine:
             hf_ratio = 1.0 - (dc_power / total_power)
             mean_hf = float(np.mean(hf_ratio))
-            score = float(np.clip(abs(mean_hf - 0.30) / 0.25, 0.0, 1.0))
-            return score
         except Exception as exc:
             logger.warning("Temporal FFT score error: %s", _short_error(exc))
             return 0.3
@@ -406,48 +420,45 @@ class SSTGNNEngine:
     def run_video(self, frames: list[np.ndarray]) -> EngineResult:
         t0 = time.perf_counter()
         self._ensure()
-        if not frames:
             return EngineResult(
                 engine="sstgnn",
-                verdict="REAL",
-                confidence=0.5,
                 attributed_generator=None,
-                explanation="No frames.",
-                processing_time_ms=0.0,
             )
-        sample = frames[::6] or [frames[0]]
-        sample_pil = [Image.fromarray(f) for f in sample]
-        # Batched CNN scoring — single pipeline call per detector for all frames
-        cnn_scores = self._batch_cnn_scores(sample_pil)
-        # Geometry scores still per-frame (MediaPipe is CPU-only)
-        geo_scores = [self._geometry_score(np.array(img)) for img in sample_pil]
-        per_frame = [
-            float(np.clip(c * 0.70 + g * 0.30, 0.0, 1.0))
-            for c, g in zip(cnn_scores, geo_scores)
-        ]
-        cnn_geo_avg = float(np.mean(per_frame))
-        # Temporal FFT on GPU
-        fft_score = self._temporal_fft_score(frames)
-        avg = float(np.clip(cnn_geo_avg * 0.80 + fft_score * 0.20, 0.0, 1.0))
-        return EngineResult(
-            engine="sstgnn",
-            verdict="FAKE" if avg > 0.5 else "REAL",
-            confidence=avg,
-            attributed_generator=None,
-            explanation=(
-                f"CNN+geometry avg {cnn_geo_avg:.2f} over {len(sample)} frames, "
-                f"temporal FFT anomaly {fft_score:.2f}."
-            ),
-            processing_time_ms=(time.perf_counter() - t0) * 1000,
-        )
     @staticmethod
     def image_stub() -> EngineResult:

 import torch
 from PIL import Image
 from src.types import EngineResult
 logger = logging.getLogger(__name__)
 CACHE = os.environ.get("MODEL_CACHE_DIR", "/tmp/models")
 _lock = threading.Lock()
 _load_attempted = False
 _detectors: list[Any] = []
     "1",
 )
+KEYPOINT_STEP = 7
+KEYPOINT_COUNT = 68
 def _skip_model_loads() -> bool:
     return os.environ.get("GENAI_SKIP_MODEL_LOAD", "").strip().lower() in {
 def _build_image_classifier(model_id: str) -> Any:
     pipeline = _get_pipeline()
+    attempts: tuple[dict[str, Any], ...] = (
+        {"cache_dir": CACHE, "device": -1},
+        {"device": -1},
         {"cache_dir": CACHE},
         {},
     )
     return float(np.clip(fake_best, 0.0, 1.0))
 class _TasksFaceMeshAdapter:
     def __init__(self, mp_module, landmarker) -> None:
         self._mp = mp_module
         logger.info("Skipping SSTGNN model load (GENAI_SKIP_MODEL_LOAD=1)")
         return
+    logger.info("Loading SSTGNN models on CPU ...")
     try:
         configured_models = [
     except Exception:
         _delaunay = None
+    logger.info("SSTGNN model load attempt complete")
def _inference_device() -> str:
    """Return the torch device string to use for the current call.

    Returns:
        "cuda" when ``torch.cuda.is_available()`` reports True, otherwise
        "cpu". A failing availability probe is also treated as "cpu".
    """
    try:
        cuda_ready = torch.cuda.is_available()
    except Exception:
        # A broken CUDA probe must not break inference; fall back to CPU.
        return "cpu"
    if cuda_ready:
        return "cuda"
    return "cpu"
def _move_detector(detector: Any, device: str) -> None:
    """Relocate one detector to ``device``.

    Args:
        detector: Object that may expose ``model`` and ``device`` attributes
            (presumably a Hugging Face pipeline -- confirm against
            ``_build_image_classifier``).
        device: Target device string, e.g. "cuda" or "cpu".

    The wrapped ``model`` is moved only when it exists and has a ``to``
    method; the detector's ``device`` attribute is rewritten only when the
    detector already has one.
    """
    wrapped = getattr(detector, "model", None)
    can_relocate = wrapped is not None and hasattr(wrapped, "to")
    if can_relocate:
        wrapped.to(device)

    if not hasattr(detector, "device"):
        return
    # Keep the detector's own device record in step with its model.
    detector.device = torch.device(device)
def _prepare_runtime(device: str) -> None:
    """Move every loaded detector onto the GPU before an inference call.

    Does nothing unless ``device`` is "cuda". A detector that fails to move
    is logged as a warning and skipped; the remaining detectors are still
    attempted.
    """
    if device == "cuda":
        for pipe in _detectors:
            try:
                _move_detector(pipe, device)
            except Exception as err:
                logger.warning("SSTGNN detector GPU move failed: %s", _short_error(err))
def _release_runtime(device: str) -> None:
    """Undo ``_prepare_runtime`` after an inference call.

    Does nothing unless ``device`` is "cuda". Otherwise every detector is
    moved back to the CPU and the CUDA cache is emptied.

    Cleanup is best-effort and never raises, since it runs from ``finally``
    blocks. Failures are logged rather than silently dropped, so a detector
    left on the GPU is visible in the logs, matching how
    ``_prepare_runtime`` reports move failures.
    """
    if device != "cuda":
        return
    for detector in _detectors:
        try:
            _move_detector(detector, "cpu")
        except Exception as exc:
            # Keep going: one stuck detector must not block the others.
            logger.warning("SSTGNN detector CPU move failed: %s", _short_error(exc))
    try:
        torch.cuda.empty_cache()
    except Exception as exc:
        # Cache release is only an optimisation; note the failure and move on.
        logger.debug("SSTGNN CUDA cache release failed: %s", _short_error(exc))
 class SSTGNNEngine:
     def run(self, image: Image.Image) -> EngineResult:
         """Score a single image and return the SSTGNN verdict.

         The image is converted to RGB when needed. With detectors loaded,
         the CNN score and the geometry score are blended 70/30; with none
         loaded, the geometry score alone is used. The verdict is "FAKE"
         when the final score exceeds 0.5.

         Detectors are moved to the inference device for the duration of the
         call and released again afterwards, even if scoring raises.
         """
         started = time.perf_counter()
         self._ensure()
         device = _inference_device()
         _prepare_runtime(device)
         try:
             rgb = image if image.mode == "RGB" else image.convert("RGB")
             cnn = self._cnn_score(rgb)
             graph = self._geometry_score(np.array(rgb))
             if not _detectors:
                 # No CNN detectors loaded: the geometry score stands alone.
                 final = float(np.clip(graph, 0.0, 1.0))
                 note = f"Geometric graph anomaly {graph:.2f} (cnn fallback unavailable)."
             else:
                 final = float(np.clip(cnn * 0.70 + graph * 0.30, 0.0, 1.0))
                 note = f"CNN ensemble {cnn:.2f}; geometric graph anomaly {graph:.2f}."
             verdict = "FAKE" if final > 0.5 else "REAL"
             elapsed_ms = (time.perf_counter() - started) * 1000
             return EngineResult(
                 engine="sstgnn",
                 verdict=verdict,
                 confidence=final,
                 attributed_generator=None,
                 explanation=note,
                 processing_time_ms=elapsed_ms,
             )
         finally:
             _release_runtime(device)
     def _cnn_score(self, image: Image.Image) -> float:
         if not _detectors:
         return 0.5
     def _batch_cnn_scores(self, images: list[Image.Image]) -> list[float]:
         if not _detectors or not images:
             return [0.5] * len(images)
         for index, detector in enumerate(_detectors):
             weight = _detector_weights[index] if index < len(_detector_weights) else 1.0
             try:
                 batch_preds = detector(images)
                 for i, preds in enumerate(batch_preds):
                     score = _fake_prob_from_preds(preds if isinstance(preds, list) else [preds])
             arr = np.array(areas, dtype=np.float32)
             cv_score = float(np.std(arr) / (np.mean(arr) + 1e-9))
             return float(np.clip((cv_score - 0.8) / 1.5, 0.0, 1.0))
         except Exception as exc:
             logger.warning("Geometry score error: %s", exc)
             return 0.3
     def _temporal_fft_score(self, frames: list[np.ndarray]) -> float:
         try:
             import cv2  # type: ignore
             gray_stack = np.array(
                 [
                     cv2.resize(
+                        cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
+                        if (frame.ndim == 3 and frame.shape[2] >= 3)
+                        else frame[:, :, 0] if frame.ndim == 3 else frame,
                         (32, 32),
                     ).astype(np.float32)
+                    for frame in sampled
                 ]
+            )
+            device = _inference_device()
+            if device == "cuda":
+                gray_tensor = torch.from_numpy(gray_stack).to(device)
+                fft_result = torch.fft.rfft(gray_tensor, dim=0)
                 power = torch.abs(fft_result) ** 2
                 dc_power = power[0].cpu().numpy()
                 total_power = (torch.sum(power, dim=0) + 1e-9).cpu().numpy()
             else:
                 fft_result = np.fft.rfft(gray_stack, axis=0)
                 power = np.abs(fft_result) ** 2
                 dc_power = power[0]
             hf_ratio = 1.0 - (dc_power / total_power)
             mean_hf = float(np.mean(hf_ratio))
+            return float(np.clip(abs(mean_hf - 0.30) / 0.25, 0.0, 1.0))
         except Exception as exc:
             logger.warning("Temporal FFT score error: %s", _short_error(exc))
             return 0.3
     def run_video(self, frames: list[np.ndarray]) -> EngineResult:
         """Score a video clip from its decoded frames.

         Every 6th frame is scored by the CNN detectors and by the geometry
         score, blended 70/30 per frame and then averaged. That average is
         blended 80/20 with the temporal FFT score computed over all frames.
         The verdict is "FAKE" when the blended score exceeds 0.5.

         Args:
             frames: Decoded frames as arrays accepted by ``Image.fromarray``
                 (presumably RGB uint8 -- TODO confirm against the caller).

         Returns:
             An ``EngineResult`` for engine "sstgnn". Empty input yields a
             neutral "REAL" result with confidence 0.5 and no GPU work.
         """
         t0 = time.perf_counter()
         self._ensure()
         # Answer the empty clip before touching the GPU: there is nothing to
         # score, so moving every detector to CUDA and back would be wasted.
         if not frames:
             return EngineResult(
                 engine="sstgnn",
                 verdict="REAL",
                 confidence=0.5,
                 attributed_generator=None,
                 explanation="No frames.",
                 processing_time_ms=0.0,
             )
         device = _inference_device()
         _prepare_runtime(device)
         try:
             sample = frames[::6] or [frames[0]]
             sample_pil = [Image.fromarray(frame) for frame in sample]
             cnn_scores = self._batch_cnn_scores(sample_pil)
             geo_scores = [self._geometry_score(np.array(image)) for image in sample_pil]
             per_frame = [
                 float(np.clip(c * 0.70 + g * 0.30, 0.0, 1.0))
                 for c, g in zip(cnn_scores, geo_scores)
             ]
             cnn_geo_avg = float(np.mean(per_frame))
             # The temporal score looks at the full clip, not just the sample.
             fft_score = self._temporal_fft_score(frames)
             avg = float(np.clip(cnn_geo_avg * 0.80 + fft_score * 0.20, 0.0, 1.0))
             return EngineResult(
                 engine="sstgnn",
                 verdict="FAKE" if avg > 0.5 else "REAL",
                 confidence=avg,
                 attributed_generator=None,
                 explanation=(
                     f"CNN+geometry avg {cnn_geo_avg:.2f} over {len(sample)} frames, "
                     f"temporal FFT anomaly {fft_score:.2f}."
                 ),
                 processing_time_ms=(time.perf_counter() - t0) * 1000,
             )
         finally:
             # Always hand the GPU back, even when scoring raises.
             _release_runtime(device)
     @staticmethod
     def image_stub() -> EngineResult: