File size: 2,647 Bytes
ee1f3df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os

import httpx
from fastapi import FastAPI, File, Form, HTTPException, UploadFile

# ASGI application object; the route decorators in this module register
# their handlers on it.
app = FastAPI(title="Visual Security Engine Gateway API")


def _engine_d_url() -> str:
    """Return the base URL of engine D with trailing slashes removed.

    The value is read from the ``ENGINE_D_URL`` environment variable on
    every call; ``http://localhost:8001`` is used when it is not set.
    """
    base_url = os.getenv("ENGINE_D_URL", "http://localhost:8001")
    return base_url.rstrip("/")


def _engine_e_url() -> str:
    """Return the base URL of engine E with trailing slashes removed.

    The value is read from the ``ENGINE_E_URL`` environment variable on
    every call; ``http://localhost:8002`` is used when it is not set.
    """
    base_url = os.getenv("ENGINE_E_URL", "http://localhost:8002")
    return base_url.rstrip("/")


def _clamp(value: float) -> float:
    """Limit *value* to the closed interval [0.0, 1.0]."""
    # Cap at the upper bound first, then raise to the lower bound.
    capped = value if value < 1.0 else 1.0
    return capped if capped > 0.0 else 0.0


@app.get("/")
def health_check() -> dict:
    """Return a fixed payload identifying this service as the gateway."""
    status_payload = {"status": "ok", "engine": "gateway"}
    return status_payload


def _as_dict(value: object) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_float(value: object, default: float) -> float:
    """Convert *value* to float, using *default* for null or non-numeric input."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


@app.post("/analyze")
async def analyze(
    image: UploadFile = File(...),
    audio_transcript: str = Form(""),
    run_caption: bool = Form(True),
    deep: bool = Form(True),
) -> dict:
    """Run an uploaded image through engine D and engine E and fuse the scores.

    The image is posted to ``/analyze_d`` on engine D first. The
    ``ocr.normalized_text`` value from that response is then forwarded,
    together with the image and ``audio_transcript``, to ``/analyze_e`` on
    engine E.

    Args:
        image: Uploaded image file; forwarded unchanged to both engines.
        audio_transcript: Text forwarded to engine E.
        run_caption: Forwarded to engine E as ``"true"``/``"false"``.
        deep: Forwarded to engine D as ``"true"``/``"false"``.

    Returns:
        A dict with the ``ocr`` and ``injection`` sections from engine D, the
        ``cross_modal``, ``ocr_vs_image`` and ``caption_alignment`` sections
        from engine E, and ``final_score``: a weighted sum of the individual
        scores clamped to [0, 1] and rounded to three decimals.

    Raises:
        HTTPException: 502 when an engine answers with an error status,
            cannot be reached, or returns a body that is not valid JSON;
            504 when a request to an engine times out.
    """
    image_bytes = await image.read()
    # httpx leaves out the ``filename`` parameter when the name is empty,
    # which would turn the part into a plain form field downstream.
    upload = (
        image.filename or "upload",
        image_bytes,
        image.content_type or "image/jpeg",
    )

    try:
        async with httpx.AsyncClient(timeout=300) as client:
            resp_d = await client.post(
                f"{_engine_d_url()}/analyze_d",
                files={"image": upload},
                data={"deep": str(deep).lower()},
            )
            resp_d.raise_for_status()
            payload_d = _as_dict(resp_d.json())
            ocr = _as_dict(payload_d.get("ocr"))

            # Engine E consumes the OCR text extracted by engine D.
            ocr_text = ocr.get("normalized_text") or ""
            resp_e = await client.post(
                f"{_engine_e_url()}/analyze_e",
                files={"image": upload},
                data={
                    "audio_transcript": audio_transcript,
                    "ocr_text": ocr_text,
                    "run_caption": str(run_caption).lower(),
                },
            )
            resp_e.raise_for_status()
            payload_e = _as_dict(resp_e.json())
    except httpx.HTTPStatusError as exc:
        raise HTTPException(
            status_code=502,
            detail=f"Upstream engine responded with HTTP {exc.response.status_code}",
        ) from exc
    except httpx.TimeoutException as exc:
        raise HTTPException(
            status_code=504,
            detail="Upstream engine timed out",
        ) from exc
    except httpx.RequestError as exc:
        raise HTTPException(
            status_code=502,
            detail=f"Upstream engine unreachable: {type(exc).__name__}",
        ) from exc
    except ValueError as exc:
        # ``Response.json()`` raises a ValueError subclass on malformed JSON.
        raise HTTPException(
            status_code=502,
            detail="Upstream engine returned invalid JSON",
        ) from exc

    # A section that is missing, null, or not an object is treated as empty.
    injection = _as_dict(payload_d.get("injection"))
    cross_modal = _as_dict(payload_e.get("cross_modal"))
    ocr_vs_image = _as_dict(payload_e.get("ocr_vs_image"))
    caption_align = _as_dict(payload_e.get("caption_alignment"))

    # Missing, null, or non-numeric scores fall back to the same defaults
    # that previously applied only to missing keys.
    ocr_conf = _as_float(ocr.get("ocr_confidence"), 0.5)
    injection_risk = _as_float(injection.get("risk_score"), 0.0)
    audio_align = _as_float(cross_modal.get("consistency_score"), 0.0)
    ocr_img_align = _as_float(ocr_vs_image.get("consistency_score"), 0.0)
    caption_align_score = _as_float(caption_align.get("alignment_score"), 0.0)

    # Injection risk is added directly; the other four scores are
    # inverted (1 - score) before weighting.
    final_score = _clamp(
        0.45 * injection_risk
        + 0.15 * (1.0 - ocr_conf)
        + 0.2 * (1.0 - audio_align)
        + 0.1 * (1.0 - ocr_img_align)
        + 0.1 * (1.0 - caption_align_score)
    )

    return {
        "ocr": ocr,
        "injection": injection,
        "cross_modal": cross_modal,
        "ocr_vs_image": ocr_vs_image,
        "caption_alignment": caption_align,
        "final_score": round(final_score, 3),
    }