Spaces:
Sleeping
Sleeping
Commit ·
cf19f11
0
Parent(s):
Prepare HF Docker deployment: add root app.py, Dockerfile, startup handling
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .DS_Store +0 -0
- Dockerfile +17 -0
- Readme.md +8 -0
- app.py +2 -0
- app/.DS_Store +0 -0
- app/__init__.py +0 -0
- app/__pycache__/__init__.cpython-310.pyc +0 -0
- app/__pycache__/__init__.cpython-311.pyc +0 -0
- app/__pycache__/__init__.cpython-313.pyc +0 -0
- app/__pycache__/main.cpython-310.pyc +0 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/__pycache__/main.cpython-313.pyc +0 -0
- app/app.py +49 -0
- app/config/.DS_Store +0 -0
- app/config/__init__.py +0 -0
- app/config/settings.py +0 -0
- app/data/.DS_Store +0 -0
- app/data/__init__.py +0 -0
- app/embeddings/__init__.py +0 -0
- app/embeddings/__pycache__/__init__.cpython-310.pyc +0 -0
- app/embeddings/__pycache__/__init__.cpython-311.pyc +0 -0
- app/embeddings/__pycache__/embedding_model.cpython-310.pyc +0 -0
- app/embeddings/__pycache__/embedding_model.cpython-311.pyc +0 -0
- app/embeddings/embedding_model.py +15 -0
- app/models/__init__.py +0 -0
- app/models/__pycache__/__init__.cpython-310.pyc +0 -0
- app/models/__pycache__/__init__.cpython-311.pyc +0 -0
- app/models/__pycache__/model1_classifier.cpython-310.pyc +0 -0
- app/models/__pycache__/model1_classifier.cpython-311.pyc +0 -0
- app/models/__pycache__/model2_risk.cpython-310.pyc +0 -0
- app/models/__pycache__/model2_risk.cpython-311.pyc +0 -0
- app/models/__pycache__/model3_action.cpython-310.pyc +0 -0
- app/models/__pycache__/model3_action.cpython-311.pyc +0 -0
- app/models/model1_classifier.py +112 -0
- app/models/model2_risk.py +143 -0
- app/models/model3_action.py +105 -0
- app/notebooks/__init__.py +0 -0
- app/pipelines/__init__.py +0 -0
- app/pipelines/__pycache__/__init__.cpython-310.pyc +0 -0
- app/pipelines/__pycache__/__init__.cpython-311.pyc +0 -0
- app/pipelines/__pycache__/security_pipeline.cpython-310.pyc +0 -0
- app/pipelines/__pycache__/security_pipeline.cpython-311.pyc +0 -0
- app/pipelines/security_pipeline.py +104 -0
- app/rag.py +0 -0
- app/rag/__init__.py +0 -0
- app/rag/__pycache__/__init__.cpython-310.pyc +0 -0
- app/rag/__pycache__/__init__.cpython-311.pyc +0 -0
- app/rag/__pycache__/knowledge_loader.cpython-310.pyc +0 -0
- app/rag/__pycache__/knowledge_loader.cpython-311.pyc +0 -0
- app/rag/__pycache__/rag_engine.cpython-310.pyc +0 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
Dockerfile
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

# Set the working directory for all subsequent instructions.
WORKDIR /code

# Copy only the dependency manifest first so the expensive pip-install layer
# is cached until requirements.txt actually changes (copying the whole repo
# first invalidated it on every source edit).
COPY requirements.txt /code/requirements.txt

# Upgrade pip and install dependencies in a single layer; --no-cache-dir
# keeps the image small.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# Copy the rest of the repository after dependencies are installed.
COPY . /code

# Hugging Face Spaces routes traffic to port 7860.
EXPOSE 7860

# Serve the FastAPI app object exposed at app/app.py (package path app.app).
CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
Readme.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
# Hugging Face Spaces configuration header.
# NOTE(review): Spaces reads this header from a file named README.md
# (uppercase); confirm the file is not committed as Readme.md, or this
# configuration will be ignored.
title: Email Security AI
emoji: "🛡️"
colorFrom: "blue"
colorTo: "purple"
sdk: docker
# Must match the port uvicorn binds to in the Dockerfile (7860).
app_port: 7860
---
|
app.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# repo-root/app.py
# Re-exports the FastAPI instance so a server can target "app:app".
# NOTE(review): this module is named "app" while a package directory app/
# also exists at the repo root; Python resolves the package first, so this
# shim may be shadowed and unreachable — confirm it is actually importable
# as intended (the Dockerfile already targets app.app:app directly).
from app.app import app
|
app/.DS_Store
ADDED
|
Binary file (10.2 kB). View file
|
|
|
app/__init__.py
ADDED
|
File without changes
|
app/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (147 Bytes). View file
|
|
|
app/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (163 Bytes). View file
|
|
|
app/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (151 Bytes). View file
|
|
|
app/__pycache__/main.cpython-310.pyc
ADDED
|
Binary file (1.72 kB). View file
|
|
|
app/__pycache__/main.cpython-311.pyc
ADDED
|
Binary file (2.71 kB). View file
|
|
|
app/__pycache__/main.cpython-313.pyc
ADDED
|
Binary file (2.38 kB). View file
|
|
|
app/app.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/app.py — FastAPI application wiring for the email security service.
import logging
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

# import the router (assumes app/routes/analyze.py exists)
from app.routes.analyze import router as analyze_router

# import pipeline to warm up singletons on startup
import app.pipelines.security_pipeline as security_pipeline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("email-security-ai")

app = FastAPI(
    title="Email Security AI",
    version="1.0",
    description="Model-1 -> Model-2 -> RAG -> Model-3 email security pipeline"
)

# Allow frontend access during development; tighten in prod.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # change to specific origins in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.include_router(analyze_router, prefix="")

# NOTE(review): @app.on_event is deprecated in newer FastAPI releases in
# favor of the lifespan API — confirm the pinned FastAPI version still
# supports it before upgrading.
@app.on_event("startup")
async def startup_event():
    """
    Ensure model singletons are created and log basic info.
    Note: model initialization (importing security_pipeline) may trigger HF model downloads
    on first run — that is expected.
    """
    logger.info("Starting Email Security AI — warming models (may take time on first run).")
    try:
        # Load the newly decoupled pipeline models to memory before serving
        security_pipeline.load_models()
        logger.info("Model singletons initialized and ready for inference.")
    except Exception as e:
        # Warm-up failure is logged but not fatal: security_pipeline.analyze()
        # calls load_models() again on each request, so loading is retried lazily.
        logger.exception("Error while warming models: %s", e)

@app.get("/health", tags=["health"])
def health():
    """Liveness probe used by the hosting platform."""
    return {"status": "ok", "ready": True}
app/config/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
app/config/__init__.py
ADDED
|
File without changes
|
app/config/settings.py
ADDED
|
File without changes
|
app/data/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
app/data/__init__.py
ADDED
|
File without changes
|
app/embeddings/__init__.py
ADDED
|
File without changes
|
app/embeddings/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (158 Bytes). View file
|
|
|
app/embeddings/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (174 Bytes). View file
|
|
|
app/embeddings/__pycache__/embedding_model.cpython-310.pyc
ADDED
|
Binary file (734 Bytes). View file
|
|
|
app/embeddings/__pycache__/embedding_model.cpython-311.pyc
ADDED
|
Binary file (953 Bytes). View file
|
|
|
app/embeddings/embedding_model.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class EmbeddingModel:
    """Thin wrapper around a sentence-transformers embedding backend."""

    def __init__(self):
        # Fixed MiniLM model, loaded through the langchain adapter.
        self.model_name = "sentence-transformers/all-MiniLM-L6-v2"
        self.embedding = HuggingFaceEmbeddings(model_name=self.model_name)

    def get(self):
        """Return the underlying embeddings object."""
        return self.embedding
|
app/models/__init__.py
ADDED
|
File without changes
|
app/models/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (154 Bytes). View file
|
|
|
app/models/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (170 Bytes). View file
|
|
|
app/models/__pycache__/model1_classifier.cpython-310.pyc
ADDED
|
Binary file (2.92 kB). View file
|
|
|
app/models/__pycache__/model1_classifier.cpython-311.pyc
ADDED
|
Binary file (2.44 kB). View file
|
|
|
app/models/__pycache__/model2_risk.cpython-310.pyc
ADDED
|
Binary file (3.32 kB). View file
|
|
|
app/models/__pycache__/model2_risk.cpython-311.pyc
ADDED
|
Binary file (4.98 kB). View file
|
|
|
app/models/__pycache__/model3_action.cpython-310.pyc
ADDED
|
Binary file (3.74 kB). View file
|
|
|
app/models/__pycache__/model3_action.cpython-311.pyc
ADDED
|
Binary file (7.51 kB). View file
|
|
|
app/models/model1_classifier.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import torch
|
| 3 |
+
from langchain_core.runnables import RunnableLambda
|
| 4 |
+
from transformers import pipeline
|
| 5 |
+
|
| 6 |
+
CANDIDATE_LABELS = [
    "phishing",
    "scam",
    "fraud",
    "malware",
    "spam",
    "benign"
]

class Model1Classifier:
    """Zero-shot email intent classifier backed by facebook/bart-large-mnli."""

    def __init__(self):
        self.model_name = "facebook/bart-large-mnli"
        self.model_version = "model1-v1"

        # Prefer Apple MPS, then CUDA, otherwise fall back to CPU.
        if torch.backends.mps.is_available():
            target_device = "mps"
        elif torch.cuda.is_available():
            target_device = "cuda"
        else:
            target_device = "cpu"

        # Zero-shot classification head over the fixed candidate label set.
        self.hf_pipeline = pipeline(
            "zero-shot-classification",
            model=self.model_name,
            device=target_device
        )

    def _run_inference(self, input_text: str) -> dict:
        """Classify *input_text* and return its ranked label/probability pairs."""
        # multi_label=False -> softmax over the labels (probabilities sum to 1).
        raw = self.hf_pipeline(
            input_text,
            candidate_labels=CANDIDATE_LABELS,
            multi_label=False,
            hypothesis_template="This email is {}."
        )

        labels = raw["labels"]
        scores = raw["scores"]

        # Heuristic floor for very short "you won a prize" style messages,
        # which the NLI model tends to under-score as phishing.
        lowered = input_text.lower()
        prize_like = "won an iphone" in lowered or (
            "congratulations" in lowered
            and ("won" in lowered or "prize" in lowered)
        )
        if prize_like:
            floor = 0.65
            try:
                current = scores[labels.index("phishing")]
            except ValueError:
                current = 0.0

            if current < floor:
                # Rescale the non-phishing mass so the total stays ~1.0.
                leftover = 1.0 - current
                factor = (1.0 - floor) / leftover if leftover > 0 else 0
                scores = [
                    floor if lbl == "phishing" else s * factor
                    for lbl, s in zip(labels, scores)
                ]
                # Re-rank by the adjusted probabilities.
                ranked = sorted(zip(labels, scores), key=lambda pair: pair[1], reverse=True)
                labels = [lbl for lbl, _ in ranked]
                scores = [s for _, s in ranked]

        top_labels = [
            {"label": lbl, "probability": round(float(s), 4)}
            for lbl, s in list(zip(labels, scores))[:5]
        ]
        return {"top_labels": top_labels}

    def predict(self, payload):
        """Run classification over a request payload with subject/content fields."""
        started = time.perf_counter()

        # Pull subject/body defensively; either may be absent or None.
        subject = getattr(payload, "subject", "") or ""
        body = ""
        content = getattr(payload, "content", None)
        if content:
            body = getattr(content, "text", "") or ""

        combined = f"{subject} {body}".strip() or "empty message"

        inference = self._run_inference(combined)
        elapsed_ms = int((time.perf_counter() - started) * 1000)

        return {
            "top_labels": inference["top_labels"],
            "model_version": self.model_version,
            "time_ms": elapsed_ms
        }
|
app/models/model2_risk.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import torch
|
| 3 |
+
from typing import Dict, Any
|
| 4 |
+
from langchain_core.runnables import RunnableLambda
|
| 5 |
+
from transformers import pipeline
|
| 6 |
+
|
| 7 |
+
class Model2RiskEngine:
    """Rule-plus-NLP risk scorer that builds on Model 1's classification."""

    def __init__(self):
        self.model_name = "microsoft/deberta-v3-base"
        self.model_version = "model2-v3"

        # Prefer Apple MPS, then CUDA, otherwise CPU.
        if torch.backends.mps.is_available():
            chosen_device = "mps"
        elif torch.cuda.is_available():
            chosen_device = "cuda"
        else:
            chosen_device = "cpu"

        # Zero-shot pipeline used to surface specific risk factors inside
        # the email body; left as None when loading fails.
        try:
            self.analyzer = pipeline(
                "zero-shot-classification",
                model=self.model_name,
                device=chosen_device
            )
        except Exception as e:
            print(f"Could not load zero-shot pipeline for {self.model_name}: {e}. Falling back to default.")
            self.analyzer = None

        self.candidate_reasons = [
            "Contains urgent financial request",
            "Suspicious domain detected",
            "Requests sensitive information"
        ]

    def _run_nlp_analysis(self, text: str) -> list:
        """Return candidate risk reasons whose zero-shot score exceeds 0.4."""
        matched = []
        if self.analyzer is not None and text.strip():
            try:
                scored = self.analyzer(text, candidate_labels=self.candidate_reasons)
                for reason, prob in zip(scored['labels'], scored['scores']):
                    if prob > 0.4:
                        matched.append(reason)
            except Exception as e:
                print(f"Model 2 inference error: {e}")
        return matched

    def analyze(self, payload: Any, model1_result: Dict) -> Dict[str, Any]:
        """Combine Model 1 output, NLP factors and structural heuristics into
        a 0-100 risk score with human-readable reasons."""
        started = time.perf_counter()

        reasons = []
        score = 0

        # Lower-cased subject/body for keyword heuristics.
        body_text = ""
        subj_text = ""
        if payload.content and payload.content.text:
            body_text = payload.content.text.lower()
        if payload.subject:
            subj_text = payload.subject.lower()
        combined = f"{subj_text} {body_text}".strip()

        # --- 1) Seed the score from Model 1's top classification ------------
        top = None
        if model1_result and model1_result.get("top_labels"):
            top = model1_result["top_labels"][0]
        if top is not None:
            if top["label"] == "benign":
                # Residual risk scaled by how unsure the model was.
                score = max(0, 20 * (1 - top["probability"]))
            else:
                score = top["probability"] * 60
                reasons.append(f"Message classified as potential {top['label']}")

        # --- 2) DeBERTa zero-shot risk factors ------------------------------
        for factor in self._run_nlp_analysis(combined):
            if factor not in reasons:
                reasons.append(factor)
            score += 10  # bump risk for each NLP factor identified

        # --- 3) Structural heuristics ---------------------------------------
        if payload.metadata and getattr(payload.metadata, "has_attachment", False):
            score += 15
            reasons.append("Message contains attachment")

        # Flag link domains that imitate login/verification portals (at most
        # one penalty regardless of how many links match).
        if payload.links:
            for link in payload.links:
                link_domain = (getattr(link, "domain", "") or "").lower()
                for marker in ["login", "verify", "secure", "update", "account"]:
                    if marker in link_domain:
                        if "Suspicious verification-style domain detected" not in reasons:
                            score += 15
                            reasons.append("Suspicious verification-style domain detected")
                        break

        # Penalize a mismatch between the From address and the declared domain.
        sender = payload.sender
        if sender and getattr(sender, "email", None) and getattr(sender, "domain", None):
            addr_domain = sender.email.split("@")[-1].lower()
            if addr_domain != sender.domain.lower():
                score += 15
                reasons.append("Sender domain mismatch detected")

        # Clamp to 0-100 and bucket into a coarse risk level.
        score = min(100, int(score))
        if score >= 75:
            level = "high"
        elif score >= 50:
            level = "risky"
        elif score >= 25:
            level = "suspicious"
        else:
            level = "safe"

        elapsed_ms = int((time.perf_counter() - started) * 1000)

        # De-duplicate reasons while keeping first-seen order.
        deduped = list(dict.fromkeys(reasons))

        return {
            "risk_score": score,
            "risk_level": level,
            "reasons": deduped,
            "model_version": self.model_version,
            "time_ms": elapsed_ms,
        }
|
app/models/model3_action.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import logging
|
| 3 |
+
from typing import List, Dict, Any, Optional
|
| 4 |
+
|
| 5 |
+
from langchain_core.prompts import PromptTemplate
|
| 6 |
+
from langchain_core.runnables import RunnableLambda
|
| 7 |
+
from transformers import pipeline
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger("email-security-ai.model3")
logger.addHandler(logging.NullHandler())

class Model3ActionGenerator:
    """
    Generates user safety actions with google/flan-t5-base through a
    LangChain prompt template.
    """

    def __init__(self, model_name: str = "google/flan-t5-base", device: str = "cpu"):
        self.model_name = model_name
        self.model_version = "model3-v2"

        # Text-to-text generator; set to None when loading fails so that
        # generate() falls back to canned actions instead of crashing.
        try:
            self.generator = pipeline(
                task="text2text-generation",
                model=self.model_name,
                device=device
            )
            logger.info("Model3 loaded successfully: %s", self.model_name)
        except Exception as e:
            logger.exception("Failed to load Model3: %s", e)
            self.generator = None

        # Prompt fed to the generator for every request.
        self.prompt = PromptTemplate.from_template(
            "Based on a {risk_level} email risk with these reasons: {reasons_text}. "
            "Relevant cybersecurity knowledge: {evidence_text}. "
            "What should the user do? List 3 specific short actionable steps separated by commas."
        )

    def _run_generation(self, prompt_text: str) -> str:
        """Invoke the HF generator; return "" when unavailable or on error."""
        if self.generator is None:
            return ""
        try:
            rendered = prompt_text.text if hasattr(prompt_text, "text") else str(prompt_text)
            outputs = self.generator(
                rendered,
                max_new_tokens=60,
                do_sample=False,
                num_return_sequences=1,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3
            )
            return outputs[0]['generated_text']
        except Exception as e:
            logger.exception("Model 3 generation failed: %s", e)
            return ""

    def _parse_output(self, generated_text: str) -> List[str]:
        """Split generated text into at most five unique action strings."""
        if not generated_text:
            return []

        # Periods become commas so run-on / repeating sentences split too.
        fragments = generated_text.replace(".", ",").split(",")

        # Deduplicate case-insensitively while preserving order.
        unique = []
        seen = set()
        for fragment in fragments:
            candidate = fragment.strip()
            if candidate and candidate.lower() not in seen:
                seen.add(candidate.lower())
                unique.append(candidate)
        return unique[:5]

    def _fallback_actions(self) -> List[str]:
        """Static advice used whenever generation produces nothing usable."""
        return [
            "Do not click any links in the message.",
            "Verify the sender through the official website.",
            "Report the message as phishing.",
            "Delete the suspicious message."
        ]

    def generate(self, risk_analysis: Dict[str, Any], rag_evidence: List[str]) -> Dict[str, Any]:
        """Produce recommended actions for the given risk analysis and evidence."""
        started = time.perf_counter()

        level = risk_analysis.get("risk_level", "unknown")
        reason_list = risk_analysis.get("reasons", [])

        filled = self.prompt.format(
            risk_level=level,
            reasons_text=", ".join(reason_list) if reason_list else "None",
            evidence_text=" ".join(rag_evidence) if rag_evidence else "None"
        )
        actions = self._parse_output(self._run_generation(filled))
        if not actions:
            actions = self._fallback_actions()

        return {
            "actions": actions,
            "model_version": self.model_version,
            "time_ms": int((time.perf_counter() - started) * 1000)
        }
|
app/notebooks/__init__.py
ADDED
|
File without changes
|
app/pipelines/__init__.py
ADDED
|
File without changes
|
app/pipelines/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (157 Bytes). View file
|
|
|
app/pipelines/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (173 Bytes). View file
|
|
|
app/pipelines/__pycache__/security_pipeline.cpython-310.pyc
ADDED
|
Binary file (1.69 kB). View file
|
|
|
app/pipelines/__pycache__/security_pipeline.cpython-311.pyc
ADDED
|
Binary file (2.68 kB). View file
|
|
|
app/pipelines/security_pipeline.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/pipelines/security_pipeline.py
#
# Orchestrates the full analysis flow: Model-1 classification, Model-2 risk
# scoring, RAG evidence retrieval and Model-3 action generation.
import time

from app.models.model1_classifier import Model1Classifier
from app.models.model2_risk import Model2RiskEngine
from app.models.model3_action import Model3ActionGenerator
from app.rag.rag_engine import RAGEngine

from app.schemas.response_schema import (
    AnalyzeResponse,
    Classification,
    RiskAnalysis,
    Metadata,
)

# -------- LAZY MODEL LOADING --------
# Module-level singletons, created on first use by load_models().
model1 = None
model2 = None
model3 = None
rag_engine = None

def load_models():
    """Instantiate all model singletons once (lazy, idempotent).

    Only `model1 is None` gates initialization: if Model1Classifier() itself
    raises, the whole block re-runs on the next call; but if a later
    constructor raises after model1 was assigned, subsequent calls skip
    initialization and leave the remaining singletons as None.
    """
    global model1, model2, model3, rag_engine
    if model1 is None:
        import os
        # Directory the RAG engine is expected to use for its vector index.
        os.makedirs("data/vector_store", exist_ok=True)
        model1 = Model1Classifier()
        model2 = Model2RiskEngine()
        model3 = Model3ActionGenerator()
        rag_engine = RAGEngine()
        # Optionally ensure index is loaded
        if hasattr(rag_engine, "rebuild_index"):
            rag_engine.rebuild_index()


def analyze(payload):
    """Run the four-stage pipeline over *payload* and build the API response.

    Stages: Model-1 classification -> Model-2 risk scoring -> RAG evidence
    retrieval -> Model-3 action generation. Returns an AnalyzeResponse.
    """
    # Lazy-load on every request so a failed startup warm-up is retried here.
    load_models()
    start_total = time.perf_counter()

    # -------------------------------------------------------------
    # 1. RUN MODEL 1 (SPAM / INTENT CLASSIFICATION)
    # -------------------------------------------------------------
    model1_result = model1.predict(payload)

    # -------------------------------------------------------------
    # 2. RUN MODEL 2 (RISK ENGINE)
    # -------------------------------------------------------------
    # Model 2 consumes Model 1's top labels to seed its risk score.
    model2_result = model2.analyze(
        payload,
        model1_result
    )

    # -------------------------------------------------------------
    # 3. RAG RETRIEVAL (EVIDENCE GATHERING)
    # -------------------------------------------------------------
    # Query the knowledge base with subject + body; tolerate a RAG engine
    # implementation that lacks get_evidence entirely.
    text_content = payload.content.text if payload.content and payload.content.text else ""
    subject = payload.subject if payload.subject else ""
    query = f"{subject} {text_content}".strip()
    rag_evidence = rag_engine.get_evidence(query) if getattr(rag_engine, "get_evidence", None) else []

    # -------------------------------------------------------------
    # 4. RUN MODEL 3 (ACTION GENERATION)
    # -------------------------------------------------------------
    model3_result = model3.generate(
        {"risk_level": model2_result.get("risk_level", "unknown"), "reasons": model2_result.get("reasons", [])},
        rag_evidence
    )
    recommended_actions = model3_result.get("actions", [])

    # -------------------------------------------------------------
    # TOTAL LATENCY METRICS
    # -------------------------------------------------------------
    total_time = int((time.perf_counter() - start_total) * 1000)

    metadata = Metadata(
        processing_time_ms=total_time,
        models={
            "model1": "facebook/bart-large-mnli",
            "model2": "microsoft/deberta-v3-base",
            "model3": "google/flan-t5-base"
        }
    )

    # -------------------------------------------------------------
    # BUILD RESPONSE
    # -------------------------------------------------------------
    response = AnalyzeResponse(
        classification=Classification(
            top_labels=model1_result["top_labels"],
            model_version=model1_result["model_version"]
        ),
        risk_analysis=RiskAnalysis(
            risk_score=model2_result["risk_score"],
            risk_level=model2_result["risk_level"],
            reasons=model2_result["reasons"],
            model_version=model2_result["model_version"]
        ),
        recommended_actions=recommended_actions,
        rag_evidence=rag_evidence,
        metadata=metadata
    )

    return response
|
app/rag.py
ADDED
|
File without changes
|
app/rag/__init__.py
ADDED
|
File without changes
|
app/rag/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (151 Bytes). View file
|
|
|
app/rag/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (167 Bytes). View file
|
|
|
app/rag/__pycache__/knowledge_loader.cpython-310.pyc
ADDED
|
Binary file (742 Bytes). View file
|
|
|
app/rag/__pycache__/knowledge_loader.cpython-311.pyc
ADDED
|
Binary file (1.19 kB). View file
|
|
|
app/rag/__pycache__/rag_engine.cpython-310.pyc
ADDED
|
Binary file (2.95 kB). View file
|
|
|