jaivsh commited on
Commit
ee1f3df
·
0 Parent(s):

add Image_prompt_detection project

Browse files
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sentinel_env/
2
+ venv/
3
+ .env/
4
+
5
+ __pycache__/
6
+ *.pyc
7
+ *.pyo
8
+
9
+ .paddlex/
10
+ .cache/
11
+
12
+ .DS_Store
13
+ .vscode/
README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Multimodal Visual Security Engine (EasyOCR + ONNX DeBERTa + CLIP + BLIP)
2
+
3
+ ## System Architecture
4
+
5
+ ```mermaid
6
+ graph TD
7
+ Input[Input: Image/Video Frame] --> Split{Parallel Process}
8
+
9
+ %% Engine D Logic
10
+ Split --> EngineD[Engine D: Prompt Injection]
11
+ EngineD --> OCR[EasyOCR: Extract Text]
12
+ OCR --> Norm[Normalization Layer]
13
+ Norm --> InjectModel["DeBERTa Prompt Injection (ONNX)"]
14
+ InjectModel --> ThreatCheck{"Threat Dictionary (aux)"}
15
+ ThreatCheck --> RiskScore[Risk Score + Reason]
16
+
17
+ %% Engine E Logic
18
+ Split --> EngineE[Engine E: Cross-Modal]
19
+ EngineE --> BLIP[BLIP: Image Caption]
20
+ InputAudio[Input: Audio Transcript] --> CLIP_Text[CLIP Text Encoder]
21
+ EngineE --> CLIP_Img[CLIP Image Encoder]
22
+ CLIP_Text --> Cosine[Cosine Similarity Calc]
23
+ CLIP_Img --> Cosine
24
+ Cosine --> Threshold{"Is Score < 0.18?"}
25
+ Threshold -- Yes --> Mismatch[Status: MISMATCH - Deepfake]
26
+ Threshold -- No --> Match[Status: MATCH - Genuine]
27
+ ```
28
+
29
+ **Engine D (Visual Prompt Injection)**
30
+ OCR-based text extraction + ML classification. EasyOCR extracts visible or hidden text (with CLAHE + Otsu binarization for low-contrast regions), a normalization layer de-obfuscates tokens, and a DeBERTa prompt‑injection classifier (ONNX runtime) scores risk. A small threat dictionary is used as auxiliary evidence in the reason string, not as the primary detector.
31
+
32
+ **Engine E (Cross-Modal Consistency)**
33
+ Semantic-based (not OCR). CLIP (ViT-B/32) embeds both the video frame and the audio transcript into a shared vector space to verify that the visual context matches the spoken context. BLIP generates an image caption and we compare it with OCR text to detect prompt/scene misalignment.
34
+
35
+ ## Quick Start
36
+
37
+ ```bash
38
+ # Install dependencies
39
+ pip install -r requirements.txt
40
+
41
+ # Run the Visual Engine Test
42
+ python -m src.engines.visual_engine
43
+ ```
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit demo UI for the Visual Security Engine.

Calls either the single gateway API or the split Engine D / Engine E APIs
and renders OCR, injection, cross-modal and fusion results.
"""
import httpx
import streamlit as st


st.set_page_config(page_title="Visual Security Engine", layout="wide")
st.title("Visual Security Engine Demo")

uploaded = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg", "webp"])
transcript = st.text_area("Audio transcript (optional)", value="a cat sitting on a ledge")

with st.sidebar:
    st.header("API Settings")
    mode = st.selectbox("API mode", ["gateway", "split"], index=0)
    gateway_url = st.text_input("Gateway URL", value="http://localhost:8000")
    engine_d_url = st.text_input("Engine D URL", value="http://localhost:8001")
    engine_e_url = st.text_input("Engine E URL", value="http://localhost:8002")
    st.caption("Gateway mode calls a single API. Split mode calls D/E separately.")
    st.header("Performance")
    run_ocr = st.checkbox("Show OCR output", value=True)
    run_injection = st.checkbox("Run prompt-injection model", value=True)
    run_cross_modal = st.checkbox("Run cross-modal check", value=True)
    run_caption = st.checkbox("Run BLIP caption alignment", value=True)
    if run_injection and not run_ocr:
        # The injection classifier consumes OCR text, so OCR must be on.
        st.info("OCR is required for prompt-injection. Enabling OCR display.")
        run_ocr = True

run_clicked = st.button("Run analysis", type="primary")

if run_clicked and not uploaded:
    st.warning("Please upload an image to continue.")

if run_clicked and uploaded:
    image_bytes = uploaded.read()
    st.image(image_bytes, caption="Uploaded image", use_container_width=True)

    with st.spinner("Calling APIs for analysis..."):
        # BUGFIX: initialize *every* result up front so the rendering section
        # below can never hit a NameError — previously ocr_vs_image /
        # caption_alignment / final_score were undefined when the Engine E
        # call failed in split mode.
        text_payload = {}
        injection_result = {"skipped": True}
        cross_modal_result = {"skipped": True}
        ocr_vs_image = {"skipped": True}
        caption_alignment = {"skipped": True}
        final_score = None

        if mode == "gateway":
            try:
                response = httpx.post(
                    f"{gateway_url.rstrip('/')}/analyze",
                    files={"image": (uploaded.name, image_bytes, uploaded.type or "image/jpeg")},
                    data={
                        "audio_transcript": transcript,
                        "run_caption": str(run_caption).lower(),
                        "deep": str(run_injection).lower(),
                    },
                    timeout=300,
                )
                response.raise_for_status()
            except Exception as exc:
                st.error("Gateway API call failed. Is it running on the configured URL?")
                st.exception(exc)
                st.stop()
            payload = response.json()
            text_payload = payload.get("ocr", {})
            injection_result = payload.get("injection", {})
            cross_modal_result = payload.get("cross_modal", {})
            ocr_vs_image = payload.get("ocr_vs_image", {})
            caption_alignment = payload.get("caption_alignment", {})
            final_score = payload.get("final_score")
        else:
            if run_injection or run_ocr:
                try:
                    response_d = httpx.post(
                        f"{engine_d_url.rstrip('/')}/analyze_d",
                        files={"image": (uploaded.name, image_bytes, uploaded.type or "image/jpeg")},
                        data={"deep": str(run_injection).lower()},
                        timeout=300,
                    )
                    response_d.raise_for_status()
                except Exception as exc:
                    st.error("Engine D API call failed. Is it running on the configured URL?")
                    st.exception(exc)
                else:
                    payload_d = response_d.json()
                    text_payload = payload_d.get("ocr", {})
                    injection_result = payload_d.get("injection", {})

            if run_cross_modal:
                try:
                    response_e = httpx.post(
                        f"{engine_e_url.rstrip('/')}/analyze_e",
                        files={"image": (uploaded.name, image_bytes, uploaded.type or "image/jpeg")},
                        data={
                            "audio_transcript": transcript,
                            # Forward Engine D's normalized text (empty if D failed).
                            "ocr_text": text_payload.get("normalized_text", ""),
                            "run_caption": str(run_caption).lower(),
                        },
                        timeout=300,
                    )
                    response_e.raise_for_status()
                except Exception as exc:
                    st.error("Engine E API call failed. Is it running on the configured URL?")
                    st.exception(exc)
                else:
                    payload_e = response_e.json()
                    cross_modal_result = payload_e.get("cross_modal", {})
                    ocr_vs_image = payload_e.get("ocr_vs_image", {})
                    caption_alignment = payload_e.get("caption_alignment", {})

    col1, col2 = st.columns(2)
    with col1:
        st.subheader("OCR Output")
        if not run_ocr:
            st.info("OCR display disabled.")
        else:
            st.text_area("Raw text", value=text_payload.get("raw_text", ""), height=150)
            st.text_area(
                "Normalized text", value=text_payload.get("normalized_text", ""), height=120
            )

    with col2:
        st.subheader("Engine D: Prompt Injection")
        st.json(injection_result)
        st.subheader("Engine E: Cross-Modal Consistency")
        st.json(cross_modal_result)
        st.subheader("OCR vs Image (CLIP)")
        st.json(ocr_vs_image)
        st.subheader("Caption Alignment (BLIP)")
        st.json(caption_alignment)
        if final_score is not None:
            st.subheader("Final Risk Score")
            st.metric("final_score", final_score)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ easyocr
2
+ opencv-python
3
+ sentence-transformers
4
+ transformers
5
+ httpx
6
+ streamlit
7
+ optimum
8
+ onnxruntime
9
+ fastapi
10
+ uvicorn
11
+ python-multipart
src/api/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """API package for Sentinel-X."""
src/api/engine_d_server.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from fastapi import FastAPI, File, Form, UploadFile
4
+
5
+ from src.engines.visual_engine import PromptInjectionEngine, THREAT_DICTIONARY
6
+
7
# FastAPI app serving Engine D (OCR + prompt-injection detection).
app = FastAPI(title="Engine D (Prompt Injection) API")
# Module-wide singleton engine; populated lazily at startup (see load_engine).
_ENGINE: PromptInjectionEngine | None = None
9
+
10
+
11
@app.on_event("startup")
def load_engine() -> None:
    """Create the module-wide PromptInjectionEngine exactly once."""
    global _ENGINE
    if _ENGINE is not None:
        return
    _ENGINE = PromptInjectionEngine(use_onnx=True)
16
+
17
+
18
@app.get("/")
def health_check() -> dict:
    """Liveness probe: confirms the Engine D service is responding."""
    return dict(status="ok", engine="d")
21
+
22
+
23
@app.post("/analyze_d")
async def analyze_engine_d(
    image: UploadFile = File(...),
    deep: bool = Form(True),
) -> dict:
    """Run OCR and prompt-injection scoring on one uploaded image.

    Args:
        image: Uploaded image file.
        deep: When true, run the DeBERTa classifier; when false, use the
            dictionary-only fast path.

    Returns:
        dict with an "ocr" payload (including mean OCR confidence) and an
        "injection" verdict.
    """
    if _ENGINE is None:
        load_engine()
    engine = _ENGINE
    payload = engine.extract_text(await image.read())
    normalized = payload["normalized_text"]
    hits = [phrase for phrase in THREAT_DICTIONARY if phrase in normalized]
    # Mean EasyOCR confidence; 0.5 ("unknown") when nothing was scored.
    confidences = [conf for _, conf in payload.get("scored", [])]
    mean_conf = float(sum(confidences) / len(confidences)) if confidences else 0.5
    if deep:
        verdict = engine.detect_injection_from_text(normalized, matched_phrases=hits)
    else:
        verdict = {
            "is_threat": bool(hits),
            "risk_score": 0.9 if hits else 0.1,
            "reason": "FastPathRegex",
        }
    return {
        "ocr": {**payload, "ocr_confidence": round(mean_conf, 3)},
        "injection": verdict,
    }
49
+
50
+
51
@app.post("/analyze_d_batch")
async def analyze_engine_d_batch(
    images: List[UploadFile] = File(...),
    deep: bool = Form(True),
) -> dict:
    """Batch variant of /analyze_d: OCR + injection scoring for many images.

    Args:
        images: Uploaded image files.
        deep: When true, classify all normalized texts in one batched
            DeBERTa call; when false, use the dictionary-only fast path.

    Returns:
        dict with parallel lists: "ocr" payloads and "injection" verdicts,
        index-aligned with the input order.
    """
    if _ENGINE is None:
        load_engine()
    engine = _ENGINE
    normalized_batch: List[str] = []
    ocr_payloads: List[dict] = []
    matched_batch: List[List[str]] = []

    for img in images:
        image_bytes = await img.read()
        payload = engine.extract_text(image_bytes)
        scores = [score for _, score in payload.get("scored", [])]
        payload["ocr_confidence"] = round(float(sum(scores) / len(scores)) if scores else 0.5, 3)
        ocr_payloads.append(payload)
        normalized_text = payload["normalized_text"]
        normalized_batch.append(normalized_text)
        matched_batch.append([phrase for phrase in THREAT_DICTIONARY if phrase in normalized_text])

    results: List[dict] = []
    # Guard the empty batch: calling the pipeline with [] is pointless and the
    # fallback loop below simply produces no results.
    if deep and normalized_batch:
        # Batch the DeBERTa pipeline to utilize parallelism.
        classifier = engine._get_injection_classifier()
        classifications = classifier(normalized_batch, top_k=1)
        for idx, classification in enumerate(classifications):
            # BUGFIX: with top_k set, the HF pipeline may return a one-element
            # list of dicts per input (the single-text path indexes [0] for
            # this reason); unwrap before calling .get to avoid AttributeError.
            if isinstance(classification, list):
                classification = classification[0]
            label = str(classification.get("label", "")).upper()
            score = float(classification.get("score", 0.0))
            # Positive class is assumed to be labelled "...1" or "INJECTION",
            # matching the single-image endpoint's convention.
            is_injection = "1" in label or "INJECTION" in label
            risk_score = score if is_injection else 1.0 - score
            reason = f"Model={label or 'UNKNOWN'}; model_score={score:.3f}"
            if matched_batch[idx]:
                reason += f"; matched_phrases={', '.join(sorted(set(matched_batch[idx])))}"
            results.append(
                {"is_threat": bool(is_injection), "risk_score": round(risk_score, 3), "reason": reason}
            )
    else:
        for matched in matched_batch:
            results.append(
                {
                    "is_threat": bool(matched),
                    "risk_score": 0.9 if matched else 0.1,
                    "reason": "FastPathRegex",
                }
            )

    return {"ocr": ocr_payloads, "injection": results}
src/api/engine_e_server.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, Form, UploadFile
2
+
3
+ from src.engines.visual_engine import CrossModalEngine
4
+
5
# FastAPI app serving Engine E (cross-modal consistency checks).
app = FastAPI(title="Engine E (Cross-Modal) API")
# Module-wide singleton engine; populated lazily at startup (see load_engine).
_ENGINE: CrossModalEngine | None = None
7
+
8
+
9
@app.on_event("startup")
def load_engine() -> None:
    """Create the module-wide CrossModalEngine exactly once."""
    global _ENGINE
    if _ENGINE is not None:
        return
    _ENGINE = CrossModalEngine()
14
+
15
+
16
@app.get("/")
def health_check() -> dict:
    """Liveness probe: confirms the Engine E service is responding."""
    return dict(status="ok", engine="e")
19
+
20
+
21
@app.post("/analyze_e")
async def analyze_engine_e(
    image: UploadFile = File(...),
    audio_transcript: str = Form(""),
    ocr_text: str = Form(""),
    run_caption: bool = Form(True),
) -> dict:
    """Run the three cross-modal consistency checks for one image.

    Args:
        image: Uploaded image file.
        audio_transcript: Spoken-context text compared against the image.
        ocr_text: Normalized OCR text (typically from Engine D).
        run_caption: Whether to run the BLIP caption-alignment check.
    """
    if _ENGINE is None:
        load_engine()
    engine = _ENGINE
    data = await image.read()

    cross_modal_result = engine.check_cross_modal(data, audio_transcript)

    if ocr_text:
        ocr_vs_image = engine.check_ocr_vs_image(data, ocr_text)
    else:
        # No OCR text to compare: report a neutral, non-mismatching result.
        ocr_vs_image = {"is_mismatch": False, "consistency_score": 0.0}

    if run_caption:
        caption_alignment = engine.check_caption_alignment(data, ocr_text)
    else:
        caption_alignment = {"caption": "", "alignment_score": 0.0}

    return {
        "cross_modal": cross_modal_result,
        "ocr_vs_image": ocr_vs_image,
        "caption_alignment": caption_alignment,
    }
src/api/gateway_server.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import httpx
4
+ from fastapi import FastAPI, File, Form, UploadFile
5
+
6
# FastAPI app fronting Engine D and Engine E behind a single /analyze endpoint.
app = FastAPI(title="Visual Security Engine Gateway API")
7
+
8
+
9
def _engine_d_url() -> str:
    """Resolve Engine D's base URL from ENGINE_D_URL (default localhost:8001)."""
    base = os.environ.get("ENGINE_D_URL", "http://localhost:8001")
    return base.rstrip("/")
11
+
12
+
13
def _engine_e_url() -> str:
    """Resolve Engine E's base URL from ENGINE_E_URL (default localhost:8002)."""
    base = os.environ.get("ENGINE_E_URL", "http://localhost:8002")
    return base.rstrip("/")
15
+
16
+
17
def _clamp(value: float) -> float:
    """Clamp *value* into the closed interval [0.0, 1.0]."""
    if value < 0.0:
        return 0.0
    if value > 1.0:
        return 1.0
    return value
19
+
20
+
21
@app.get("/")
def health_check() -> dict:
    """Liveness probe: confirms the gateway service is responding."""
    return dict(status="ok", engine="gateway")
24
+
25
+
26
@app.post("/analyze")
async def analyze(
    image: UploadFile = File(...),
    audio_transcript: str = Form(""),
    run_caption: bool = Form(True),
    deep: bool = Form(True),
) -> dict:
    """Fan one image out to Engine D, feed its OCR text to Engine E, and
    fuse all sub-scores into a single clamped final risk score.

    Args:
        image: Uploaded image file, forwarded to both engines.
        audio_transcript: Spoken-context text for the cross-modal check.
        run_caption: Whether Engine E should run BLIP caption alignment.
        deep: Whether Engine D should run the full classifier.
    """
    image_bytes = await image.read()
    upload = (image.filename, image_bytes, image.content_type or "image/jpeg")

    async with httpx.AsyncClient(timeout=300) as client:
        resp_d = await client.post(
            f"{_engine_d_url()}/analyze_d",
            files={"image": upload},
            data={"deep": str(deep).lower()},
        )
        resp_d.raise_for_status()
        payload_d = resp_d.json()

        resp_e = await client.post(
            f"{_engine_e_url()}/analyze_e",
            files={"image": upload},
            data={
                "audio_transcript": audio_transcript,
                "ocr_text": payload_d.get("ocr", {}).get("normalized_text", ""),
                "run_caption": str(run_caption).lower(),
            },
        )
        resp_e.raise_for_status()
        payload_e = resp_e.json()

    injection = payload_d.get("injection", {})
    cross_modal = payload_e.get("cross_modal", {})
    ocr_vs_image = payload_e.get("ocr_vs_image", {})
    caption_align = payload_e.get("caption_alignment", {})

    ocr_conf = float(payload_d.get("ocr", {}).get("ocr_confidence", 0.5))

    # Weighted fusion: injection risk dominates; low confidence/alignment on
    # any channel pushes the score up. Weights sum to 1.0.
    raw_score = (
        0.45 * float(injection.get("risk_score", 0.0))
        + 0.15 * (1.0 - ocr_conf)
        + 0.2 * (1.0 - float(cross_modal.get("consistency_score", 0.0)))
        + 0.1 * (1.0 - float(ocr_vs_image.get("consistency_score", 0.0)))
        + 0.1 * (1.0 - float(caption_align.get("alignment_score", 0.0)))
    )

    return {
        "ocr": payload_d.get("ocr", {}),
        "injection": injection,
        "cross_modal": cross_modal,
        "ocr_vs_image": ocr_vs_image,
        "caption_alignment": caption_align,
        "final_score": round(_clamp(raw_score), 3),
    }
src/api/server.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, Form, UploadFile
2
+
3
+ from src.engines.visual_engine import VisualSecurityEngine
4
+
5
# Monolithic FastAPI app running both engines in one process.
app = FastAPI(title="Visual Security Engine API")
# Module-wide singleton engine; populated lazily at startup (see load_engine).
_ENGINE: VisualSecurityEngine | None = None
7
+
8
+
9
@app.on_event("startup")
def load_engine() -> None:
    """Create the module-wide VisualSecurityEngine exactly once."""
    global _ENGINE
    if _ENGINE is not None:
        return
    _ENGINE = VisualSecurityEngine()
14
+
15
+
16
@app.get("/")
def health_check() -> dict:
    """Liveness probe: confirms the monolithic service is responding."""
    return dict(status="ok")
19
+
20
+
21
@app.post("/analyze")
async def analyze_image(
    image: UploadFile = File(...),
    audio_transcript: str = Form(""),
    run_ocr: bool = Form(True),
    run_injection: bool = Form(True),
    run_cross_modal: bool = Form(True),
) -> dict:
    """Single-process endpoint: OCR, injection detection, cross-modal check.

    Any stage toggled off reports {"skipped": True} in its result slot.
    """
    if _ENGINE is None:
        load_engine()
    engine = _ENGINE
    image_bytes = await image.read()

    # Injection detection consumes OCR text, so OCR is forced on with it.
    run_ocr = run_ocr or run_injection

    text_payload = engine.extract_text(image_bytes) if run_ocr else None

    if run_injection:
        normalized = text_payload["normalized_text"] if text_payload else ""
        injection_result = engine.detect_injection_from_text(normalized)
    else:
        injection_result = {"skipped": True}

    if not run_cross_modal:
        cross_modal_result = {"skipped": True}
    elif audio_transcript.strip():
        cross_modal_result = engine.check_cross_modal(image_bytes, audio_transcript)
    else:
        # No transcript to compare against: report maximal mismatch,
        # mirroring CrossModalEngine's empty-transcript behavior.
        cross_modal_result = {"is_mismatch": True, "consistency_score": 0.0}

    return {
        "ocr": text_payload or {"skipped": True},
        "injection": injection_result,
        "cross_modal": cross_modal_result,
    }
src/engines/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Engines package for Sentinel-X."""
src/engines/visual_engine.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ import re
4
+ import urllib.request
5
+ from typing import Any, Dict, Iterable, List, Tuple, Union
6
+
7
+ import numpy as np
8
+ import cv2
9
+ from PIL import Image
10
+ import easyocr
11
+ from sentence_transformers import SentenceTransformer
12
+ import torch
13
+ from transformers import (
14
+ AutoModelForSequenceClassification,
15
+ AutoTokenizer,
16
+ BlipForConditionalGeneration,
17
+ BlipProcessor,
18
+ pipeline,
19
+ )
20
+
21
try:
    # Optional ONNX Runtime backend (via optimum); when the package is
    # missing the engine falls back to the plain PyTorch classifier.
    from optimum.onnxruntime import ORTModelForSequenceClassification

    _HAS_ORT = True
except Exception:
    _HAS_ORT = False


# Auxiliary phrase list matched against *normalized* (lowercased,
# de-obfuscated) OCR text. Used as supporting evidence in reason strings
# and as the fallback detector when the classifier cannot be loaded.
THREAT_DICTIONARY = [
    "ignore previous",
    "system override",
    "transfer funds",
    "bypass safety",
    "disable guardrails",
    "override policy",
    "reveal secrets",
]
38
+
39
+
40
class PromptInjectionEngine:
    """Engine D: OCR-based visual prompt-injection detector.

    Pipeline: EasyOCR extracts text (with a CLAHE + Otsu enhanced second
    pass for low-contrast/hidden text), a normalization step de-obfuscates
    tokens, and a DeBERTa text-classification model scores the result
    (ONNX via optimum when available, otherwise PyTorch). Dictionary
    matches are only auxiliary evidence. All heavy resources are created
    lazily on first use.
    """

    def __init__(
        self,
        use_onnx: bool | None = None,
        force_cpu: bool | None = None,
        model_name: str | None = None,
    ) -> None:
        """Configure the engine without loading any models yet.

        Args:
            use_onnx: Prefer the ONNX backend; when None, read from the
                SENTINEL_USE_ONNX env var (default on).
            force_cpu: Pin the PyTorch backend to CPU; when None, read from
                the SENTINEL_FORCE_CPU env var.
            model_name: Hugging Face model id; defaults to the protectai
                DeBERTa prompt-injection classifier.
        """
        # setdefault keeps any values the operator already exported.
        os.environ.setdefault("HF_HUB_TIMEOUT", "60")
        os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "60")
        os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
        self._ocr: easyocr.Reader | None = None
        self._injection_classifier = None
        self._model_name = model_name or "protectai/deberta-v3-base-prompt-injection"
        if force_cpu is None:
            self._force_cpu = os.environ.get("SENTINEL_FORCE_CPU", "").lower() in {
                "1",
                "true",
                "yes",
            }
        else:
            self._force_cpu = force_cpu
        if use_onnx is None:
            self._use_onnx = os.environ.get("SENTINEL_USE_ONNX", "1") not in {"0", "false"}
        else:
            self._use_onnx = use_onnx

    def _get_ocr(self) -> easyocr.Reader:
        """Lazily build the EasyOCR reader; fall back to CPU if GPU init fails."""
        if self._ocr is None:
            ocr_gpu = os.environ.get("SENTINEL_OCR_GPU", "1") not in {"0", "false"}
            try:
                self._ocr = easyocr.Reader(["en"], gpu=ocr_gpu)
            except Exception:
                self._ocr = easyocr.Reader(["en"], gpu=False)
        return self._ocr

    def _get_injection_classifier(self):
        """Lazily build the text-classification pipeline.

        Tries local files first (offline-friendly), then falls back to a
        hub download. ONNX path is used only when both requested and the
        optimum runtime imported successfully.
        """
        if self._injection_classifier is None:
            if self._use_onnx and _HAS_ORT:
                try:
                    tokenizer = AutoTokenizer.from_pretrained(
                        self._model_name, subfolder="onnx", local_files_only=True
                    )
                    model = ORTModelForSequenceClassification.from_pretrained(
                        self._model_name, subfolder="onnx", export=False, local_files_only=True
                    )
                except Exception:
                    # Cache miss: allow a network download of the same files.
                    tokenizer = AutoTokenizer.from_pretrained(self._model_name, subfolder="onnx")
                    model = ORTModelForSequenceClassification.from_pretrained(
                        self._model_name, subfolder="onnx", export=False
                    )
                self._injection_classifier = pipeline(
                    "text-classification",
                    model=model,
                    tokenizer=tokenizer,
                    truncation=True,
                    max_length=512,
                )
            else:
                try:
                    tokenizer = AutoTokenizer.from_pretrained(
                        self._model_name, local_files_only=True
                    )
                    model = AutoModelForSequenceClassification.from_pretrained(
                        self._model_name, local_files_only=True
                    )
                except Exception:
                    tokenizer = AutoTokenizer.from_pretrained(self._model_name)
                    model = AutoModelForSequenceClassification.from_pretrained(self._model_name)
                # Apple-Silicon "mps" is the only GPU path considered here;
                # CUDA is not probed — presumably deliberate for this project.
                device = torch.device(
                    "cpu"
                    if self._force_cpu or not torch.backends.mps.is_available()
                    else "mps"
                )
                self._injection_classifier = pipeline(
                    "text-classification",
                    model=model,
                    tokenizer=tokenizer,
                    truncation=True,
                    max_length=512,
                    device=device,
                )
        return self._injection_classifier

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Lowercase, strip non-alphanumerics, and merge spaced-out letters.

        Runs of single-letter tokens (e.g. "i g n o r e") are fused into one
        word to defeat simple character-spacing obfuscation.
        """
        lowered = text.lower()
        cleaned = re.sub(r"[^a-z0-9]+", " ", lowered)
        tokens = cleaned.split()

        def merge_single_letter_runs(items: Iterable[str]) -> List[str]:
            merged: List[str] = []
            run: List[str] = []
            for token in items:
                if len(token) == 1:
                    run.append(token)
                    continue
                if run:
                    merged.append("".join(run))
                    run = []
                merged.append(token)
            if run:
                merged.append("".join(run))
            return merged

        merged_tokens = merge_single_letter_runs(tokens)
        return " ".join(merged_tokens)

    @staticmethod
    def _load_image_for_ocr(image: Union[str, bytes]) -> Union[str, np.ndarray]:
        """Pass file paths through untouched; decode raw bytes to an RGB array."""
        if isinstance(image, str):
            return image
        pil_image = Image.open(io.BytesIO(image)).convert("RGB")
        return np.array(pil_image)

    @staticmethod
    def _enhance_for_hidden_text(image: np.ndarray) -> np.ndarray:
        """Boost low-contrast text: CLAHE equalization then Otsu binarization."""
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # EasyOCR is fed 3-channel images, so convert the mask back to RGB.
        return cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)

    @staticmethod
    def _load_image_for_clip(image: Union[str, bytes]) -> Image.Image:
        """Load a path or raw bytes as an RGB PIL image."""
        if isinstance(image, str):
            return Image.open(image).convert("RGB")
        return Image.open(io.BytesIO(image)).convert("RGB")

    @staticmethod
    def _extract_ocr_text(ocr_result: List[Any]) -> Tuple[str, List[Tuple[str, float]]]:
        """Flatten EasyOCR output into joined text plus (text, confidence) pairs."""
        fragments: List[str] = []
        scored: List[Tuple[str, float]] = []
        # EasyOCR returns: [([bbox], text, confidence), ...]
        for line in ocr_result or []:
            if not line or len(line) < 2:
                continue
            text = str(line[1])
            score = float(line[2]) if len(line) > 2 and isinstance(line[2], (float, int)) else None
            if text:
                fragments.append(text)
                if score is not None:
                    scored.append((text, score))
        return " ".join(fragments), scored

    def detect_injection(self, image: Union[str, bytes]) -> Dict[str, Any]:
        """Convenience wrapper: OCR the image, then classify the text."""
        text_payload = self.extract_text(image)
        return self.detect_injection_from_text(
            text_payload["normalized_text"],
            matched_phrases=[
                phrase for phrase in THREAT_DICTIONARY if phrase in text_payload["normalized_text"]
            ],
        )

    def detect_injection_from_text(
        self, normalized_text: str, matched_phrases: List[str] | None = None
    ) -> Dict[str, Any]:
        """Score normalized text for prompt injection.

        Returns a dict with "is_threat", "risk_score" in [0, 1], and a
        human-readable "reason". Falls back to dictionary matching when the
        classifier cannot run.
        """
        if not normalized_text:
            return {
                "is_threat": False,
                "risk_score": 0.0,
                "reason": "No readable text detected in image.",
            }

        # NOTE(review): an explicitly-passed empty list falls through to a
        # recompute here ("or"); harmless given the same dictionary is used.
        matched = matched_phrases or [
            phrase for phrase in THREAT_DICTIONARY if phrase in normalized_text
        ]

        try:
            classifier = self._get_injection_classifier()
            classification = classifier(normalized_text, top_k=1)[0]
            label = str(classification.get("label", "")).upper()
            score = float(classification.get("score", 0.0))
            # Positive class assumed to be labelled "...1" or "INJECTION".
            is_injection = "1" in label or "INJECTION" in label
            risk_score = score if is_injection else 1.0 - score
            reason_parts = [
                f"Model={label or 'UNKNOWN'}",
                f"model_score={score:.3f}",
            ]
        except Exception:
            # Classifier unavailable: best-effort dictionary verdict.
            is_injection = bool(matched)
            risk_score = 0.9 if matched else 0.1
            reason_parts = ["Model=FALLBACK", "model_score=0.0"]
        if matched:
            reason_parts.append(f"matched_phrases={', '.join(sorted(set(matched)))}")

        return {
            "is_threat": bool(is_injection),
            "risk_score": round(risk_score, 3),
            "reason": "; ".join(reason_parts),
        }

    def extract_text(self, image: Union[str, bytes]) -> Dict[str, Any]:
        """OCR the image; for in-memory images, also OCR an enhanced copy.

        Returns a dict with "raw_text", "normalized_text", and "scored"
        (text, confidence) pairs. File-path inputs skip the enhancement
        pass (EasyOCR reads the path directly).
        """
        ocr_input = self._load_image_for_ocr(image)
        reader = self._get_ocr()
        if isinstance(ocr_input, str):
            ocr_result = reader.readtext(ocr_input)
            raw_text, scored = self._extract_ocr_text(ocr_result)
            normalized = self._normalize_text(raw_text)
        else:
            base_result = reader.readtext(ocr_input)
            enhanced_image = self._enhance_for_hidden_text(ocr_input)
            enhanced_result = reader.readtext(enhanced_image)
            raw_text_base, scored_base = self._extract_ocr_text(base_result)
            raw_text_enh, scored_enh = self._extract_ocr_text(enhanced_result)
            # Both passes are concatenated; duplicates are possible by design.
            raw_text = " ".join([raw_text_base, raw_text_enh]).strip()
            scored = scored_base + scored_enh
            normalized = self._normalize_text(raw_text)
        return {
            "raw_text": raw_text,
            "normalized_text": normalized,
            "scored": scored,
        }
252
+
253
+
254
class CrossModalEngine:
    """Engine E: semantic consistency checks via CLIP and BLIP.

    CLIP (sentence-transformers wrapper) embeds image and text into a
    shared space; cosine similarity below 0.18 is treated as a mismatch.
    BLIP generates an image caption for the caption-alignment check.
    The BLIP captioner is loaded lazily; CLIP loads in __init__.
    """

    def __init__(self, clip_model: str | None = None, caption_model: str | None = None) -> None:
        """Load the CLIP encoder and remember the BLIP model id.

        Args:
            clip_model: sentence-transformers model name; defaults from
                SENTINEL_CLIP_MODEL, then "clip-ViT-B-32".
            caption_model: BLIP model id; defaults from SENTINEL_BLIP_MODEL,
                then the Salesforce base captioning model.
        """
        self._clip = SentenceTransformer(
            clip_model or os.environ.get("SENTINEL_CLIP_MODEL", "clip-ViT-B-32")
        )
        self._captioner = None
        self._caption_model = caption_model or os.environ.get(
            "SENTINEL_BLIP_MODEL", "Salesforce/blip-image-captioning-base"
        )

    @staticmethod
    def _load_image_for_clip(image: Union[str, bytes]) -> Image.Image:
        """Load a path or raw bytes as an RGB PIL image."""
        if isinstance(image, str):
            return Image.open(image).convert("RGB")
        return Image.open(io.BytesIO(image)).convert("RGB")

    def _get_captioner(self):
        """Lazily build and cache the (processor, model, device) BLIP triple."""
        if self._captioner is None:
            # Use BLIP processor + model directly to avoid pipeline task mismatches.
            processor = BlipProcessor.from_pretrained(self._caption_model)
            model = BlipForConditionalGeneration.from_pretrained(self._caption_model)
            device = os.environ.get("SENTINEL_BLIP_DEVICE", "cpu")
            model.to(device)
            self._captioner = (processor, model, device)
        return self._captioner

    def check_cross_modal(self, image: Union[str, bytes], audio_transcript: str) -> Dict[str, Any]:
        """CLIP-compare the image against the audio transcript.

        An empty transcript is reported as a mismatch with score 0.0.
        Embeddings are L2-normalized, so the dot product is the cosine
        similarity; 0.18 is the project's mismatch threshold.
        """
        if not audio_transcript:
            return {"is_mismatch": True, "consistency_score": 0.0}

        pil_image = self._load_image_for_clip(image)
        image_emb = self._clip.encode([pil_image], normalize_embeddings=True)
        text_emb = self._clip.encode([audio_transcript], normalize_embeddings=True)
        similarity = float(np.dot(image_emb[0], text_emb[0]))

        return {
            "is_mismatch": similarity < 0.18,
            "consistency_score": round(similarity, 4),
        }

    def check_ocr_vs_image(self, image: Union[str, bytes], ocr_text: str) -> Dict[str, Any]:
        """CLIP-compare the image against its own OCR text.

        Empty OCR text returns a neutral non-mismatch (nothing to compare).
        """
        if not ocr_text:
            return {"is_mismatch": False, "consistency_score": 0.0}
        pil_image = self._load_image_for_clip(image)
        image_emb = self._clip.encode([pil_image], normalize_embeddings=True)
        text_emb = self._clip.encode([ocr_text], normalize_embeddings=True)
        similarity = float(np.dot(image_emb[0], text_emb[0]))
        return {
            "is_mismatch": similarity < 0.18,
            "consistency_score": round(similarity, 4),
        }

    def check_caption_alignment(self, image: Union[str, bytes], ocr_text: str) -> Dict[str, Any]:
        """Generate a BLIP caption and CLIP-compare it against the OCR text.

        Returns the caption plus an alignment score; empty OCR text skips
        generation entirely.
        """
        if not ocr_text:
            return {"caption": "", "alignment_score": 0.0}
        pil_image = self._load_image_for_clip(image)
        processor, model, device = self._get_captioner()
        inputs = processor(images=pil_image, return_tensors="pt").to(device)
        output_ids = model.generate(**inputs, max_new_tokens=30)
        caption = processor.decode(output_ids[0], skip_special_tokens=True)
        text_emb = self._clip.encode([ocr_text], normalize_embeddings=True)
        caption_emb = self._clip.encode([caption], normalize_embeddings=True)
        similarity = float(np.dot(text_emb[0], caption_emb[0]))
        return {"caption": caption, "alignment_score": round(similarity, 4)}
318
+
319
+
320
class VisualSecurityEngine:
    """Facade bundling Engine D (prompt injection) and Engine E (cross-modal).

    Exposes the union of both engines' public methods so callers only need
    a single object.
    """

    def __init__(
        self,
        use_onnx: bool | None = None,
        force_cpu: bool | None = None,
        clip_model: str | None = None,
    ) -> None:
        self.engine_d = PromptInjectionEngine(use_onnx=use_onnx, force_cpu=force_cpu)
        self.engine_e = CrossModalEngine(clip_model=clip_model)

    # ---- Engine D delegations ------------------------------------------

    def extract_text(self, image: Union[str, bytes]) -> Dict[str, Any]:
        """OCR the image; see PromptInjectionEngine.extract_text."""
        return self.engine_d.extract_text(image)

    def detect_injection(self, image: Union[str, bytes]) -> Dict[str, Any]:
        """OCR then classify; see PromptInjectionEngine.detect_injection."""
        return self.engine_d.detect_injection(image)

    def detect_injection_from_text(
        self, normalized_text: str, matched_phrases: List[str] | None = None
    ) -> Dict[str, Any]:
        """Classify already-normalized text for prompt injection."""
        return self.engine_d.detect_injection_from_text(normalized_text, matched_phrases)

    # ---- Engine E delegations ------------------------------------------

    def check_cross_modal(self, image: Union[str, bytes], audio_transcript: str) -> Dict[str, Any]:
        """CLIP similarity between the image and the audio transcript."""
        return self.engine_e.check_cross_modal(image, audio_transcript)

    def check_ocr_vs_image(self, image: Union[str, bytes], ocr_text: str) -> Dict[str, Any]:
        """CLIP similarity between the image and its OCR text."""
        return self.engine_e.check_ocr_vs_image(image, ocr_text)

    def check_caption_alignment(self, image: Union[str, bytes], ocr_text: str) -> Dict[str, Any]:
        """BLIP caption vs OCR text alignment."""
        return self.engine_e.check_caption_alignment(image, ocr_text)
349
+
350
+
351
def _download_demo_image() -> bytes:
    """Fetch a small public test photo, trying mirror URLs in order.

    Returns:
        The raw image bytes of the first URL that responds.

    Raises:
        RuntimeError: if every candidate URL fails.
    """
    candidates = (
        "https://upload.wikimedia.org/wikipedia/commons/3/3a/Cat03.jpg",
        "https://upload.wikimedia.org/wikipedia/commons/7/74/A-Cat.jpg",
    )
    headers = {"User-Agent": "Mozilla/5.0 (Sentinel-X demo)"}
    failure: Exception | None = None
    for candidate in candidates:
        request = urllib.request.Request(candidate, headers=headers)
        try:
            with urllib.request.urlopen(request, timeout=20) as response:
                return response.read()
        except Exception as exc:  # pragma: no cover - best effort demo download
            failure = exc
    raise RuntimeError(f"Failed to download demo image: {failure}")
367
+
368
+
369
if __name__ == "__main__":
    # Manual smoke test: download a demo photo and run both engines on it.
    demo_bytes = _download_demo_image()

    engine = VisualSecurityEngine()
    injection_result = engine.detect_injection(demo_bytes)
    cross_modal_result = engine.check_cross_modal(demo_bytes, "a cat sitting on a ledge")

    print("Injection detection:", injection_result)
    print("Cross-modal consistency:", cross_modal_result)
test_visual.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Placeholder harness; replace with real assertions against the engines.
if __name__ == "__main__":
    print("Run visual engine tests once visual_engine.py is ready.")