Daniel Pedrinho committed on
Commit
13a9adc
·
1 Parent(s): c37c122

Model Commit

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ token
api.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Spam Detection API
3
+ Ensemble of RoBERTa-Large + ELECTRA-Large classifiers.
4
+ Run with: uvicorn api:app --reload
5
+ """
6
+
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
+ import email
13
+ from email import policy as email_policy
14
+ import numpy as np
15
+ import torch
16
+ from fastapi import FastAPI, HTTPException, UploadFile, File
17
+ from fastapi.middleware.cors import CORSMiddleware
18
+ from pydantic import BaseModel
19
+ from transformers import (
20
+ AutoTokenizer,
21
+ ElectraForSequenceClassification,
22
+ RobertaForSequenceClassification,
23
+ )
24
+
25
# ── Config ────────────────────────────────────────────────────────────────────

# Paths are resolved relative to this file so the API works regardless of the
# current working directory uvicorn is launched from.
BASE_DIR = Path(__file__).parent
MODELS_DIR = BASE_DIR / "models"

# Fine-tuned checkpoint directories (Hugging Face `save_pretrained` layout).
ROBERTA_DIR = MODELS_DIR / "roberta_large_final"
ELECTRA_DIR = MODELS_DIR / "electra_large_final"

# Run inference on GPU when available, otherwise CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Upper bound of the "maybe spam" band: probabilities in
# [threshold, MAYBE_SPAM_UPPER) are flagged "maybe spam"; >= is "spam".
MAYBE_SPAM_UPPER = 0.50  # [threshold, MAYBE_SPAM_UPPER) → "maybe spam"
36
+
37
+
38
# ── App ───────────────────────────────────────────────────────────────────────

app = FastAPI(
    title="Spam Detection API",
    description="Ensemble of RoBERTa-Large + ELECTRA-Large for spam/ham classification.",
    version="1.0.0",
)

# CORS fix: browsers send `Origin: scheme://host[:port]` with NO path, so an
# allowed origin containing "/gone-phishing/" can never match and every
# cross-origin request from the frontend would be rejected. The origin must be
# the bare scheme + host.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["https://pedrinho-dev01.github.io"],
    allow_methods=["*"],
    allow_headers=["*"],
)
52
+
53
+
54
+ # ── Model loading ─────────────────────────────────────────────────────────────
55
+
56
class ModelBundle:
    """A fine-tuned classifier, its tokenizer, and its decision threshold.

    Loads everything from a single directory in Hugging Face
    `save_pretrained` layout that additionally contains a
    `threshold_config.json` with a `recommended_threshold` entry.
    """

    def __init__(self, model_dir: Path, model_class, tokenizer_class=None):
        """Load model, tokenizer, and threshold from *model_dir*.

        Args:
            model_dir: Checkpoint directory.
            model_class: The ``*ForSequenceClassification`` class to load.
            tokenizer_class: Optional tokenizer class; falls back to
                ``AutoTokenizer``. (Bug fix: this parameter was previously
                accepted but silently ignored.)
        """
        self.model_dir = model_dir
        tokenizer_cls = tokenizer_class or AutoTokenizer
        self.tokenizer = tokenizer_cls.from_pretrained(str(model_dir))
        self.model = model_class.from_pretrained(str(model_dir))
        self.model.to(DEVICE)
        self.model.eval()

        # Per-model decision threshold tuned offline and shipped alongside
        # the checkpoint.
        threshold_path = model_dir / "threshold_config.json"
        with open(threshold_path) as f:
            cfg = json.load(f)
        self.threshold: float = cfg["recommended_threshold"]

    @torch.no_grad()
    def predict_proba(self, text: str) -> float:
        """Return P(spam) as a float in [0, 1]."""
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,  # hard cap at the encoder context window
            max_length=512,
            padding=True,
        )
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        logits = self.model(**inputs).logits  # shape (1, 2)
        proba = torch.softmax(logits, dim=-1)[0, 1].item()  # P(class=1 / spam)
        return proba
83
+
84
+
85
roberta_bundle: Optional[ModelBundle] = None
electra_bundle: Optional[ModelBundle] = None


@app.on_event("startup")
def load_models():
    """Load both fine-tuned models into the module-level globals at startup."""
    global roberta_bundle, electra_bundle
    loaded = {}
    # Data-driven load keeps the two models symmetric; print order matches
    # the insertion order (RoBERTa first, then ELECTRA).
    specs = {
        "RoBERTa": (ROBERTA_DIR, RobertaForSequenceClassification),
        "ELECTRA": (ELECTRA_DIR, ElectraForSequenceClassification),
    }
    for label, (model_dir, model_cls) in specs.items():
        print(f"Loading {label} …")
        loaded[label] = ModelBundle(model_dir, model_cls)
    roberta_bundle = loaded["RoBERTa"]
    electra_bundle = loaded["ELECTRA"]
    print(f"Models loaded on {DEVICE}.")
97
+
98
+
99
+ # ── Schemas ───────────────────────────────────────────────────────────────────
100
+
101
class PredictRequest(BaseModel):
    """Request body for POST /predict."""
    # Raw message text to classify; must be non-empty after stripping.
    text: str
    # Which classifier to use for the final verdict.
    model: str = "ensemble"  # "ensemble" | "roberta" | "electra"
104
+
105
class ModelResult(BaseModel):
    """Per-model outcome included in every /predict response."""
    # P(spam) from this model, rounded to 4 decimals.
    spam_probability: float
    # True when this model's probability >= MAYBE_SPAM_UPPER.
    is_spam: bool
    # This model's tuned decision threshold (from threshold_config.json).
    threshold: float
109
+
110
class PredictResponse(BaseModel):
    """Response body for /predict and /predict/eml."""
    # Echo of the analyzed text.
    text: str
    # "ensemble", "roberta", or "electra" — whichever produced the verdict.
    model_used: str
    # Final verdict flags (mutually exclusive; see classify()).
    is_spam: bool
    maybe_spam: bool
    # Final P(spam), rounded to 4 decimals.
    spam_probability: float
    # Threshold applied for the "maybe spam" lower bound (averaged for ensemble).
    ensemble_threshold: float
    # Upper bound of the "maybe spam" band (module constant MAYBE_SPAM_UPPER).
    maybe_spam_upper_threshold: float
    # Per-model details; always populated by /predict in the visible code.
    roberta: Optional[ModelResult] = None
    electra: Optional[ModelResult] = None
120
+
121
+
122
+ # ── Helpers ───────────────────────────────────────────────────────────────────
123
+
124
def classify(proba: float, threshold: float) -> dict:
    """Return is_spam and maybe_spam flags for a given probability.

    `is_spam` triggers at MAYBE_SPAM_UPPER; `maybe_spam` covers the band
    [threshold, MAYBE_SPAM_UPPER). The two flags are mutually exclusive.
    """
    definitely_spam = proba >= MAYBE_SPAM_UPPER
    suspicious = (not definitely_spam) and proba >= threshold
    return {"is_spam": definitely_spam, "maybe_spam": suspicious}
129
+
130
+
131
+ # ── Endpoints ─────────────────────────────────────────────────────────────────
132
+
133
+ @app.get("/")
134
+ def root():
135
+ return {"status": "ok", "message": "Spam Detection API is running."}
136
+
137
+
138
+ @app.get("/health")
139
+ def health():
140
+ return {
141
+ "status": "healthy",
142
+ "device": DEVICE,
143
+ "models_loaded": roberta_bundle is not None and electra_bundle is not None,
144
+ }
145
+
146
+
147
+ @app.post("/predict", response_model=PredictResponse)
148
+ def predict(req: PredictRequest):
149
+ if not req.text.strip():
150
+ raise HTTPException(status_code=422, detail="text must not be empty.")
151
+
152
+ model_key = req.model.lower()
153
+ if model_key not in ("ensemble", "roberta", "electra"):
154
+ raise HTTPException(status_code=422, detail="model must be 'ensemble', 'roberta', or 'electra'.")
155
+
156
+ roberta_proba = roberta_bundle.predict_proba(req.text)
157
+ electra_proba = electra_bundle.predict_proba(req.text)
158
+
159
+ roberta_result = ModelResult(
160
+ spam_probability=round(roberta_proba, 4),
161
+ is_spam=roberta_proba >= MAYBE_SPAM_UPPER,
162
+ threshold=roberta_bundle.threshold,
163
+ )
164
+ electra_result = ModelResult(
165
+ spam_probability=round(electra_proba, 4),
166
+ is_spam=electra_proba >= MAYBE_SPAM_UPPER,
167
+ threshold=electra_bundle.threshold,
168
+ )
169
+
170
+ if model_key == "roberta":
171
+ final_proba = roberta_proba
172
+ ensemble_threshold = roberta_bundle.threshold
173
+ elif model_key == "electra":
174
+ final_proba = electra_proba
175
+ ensemble_threshold = electra_bundle.threshold
176
+ else:
177
+ # Ensemble: average the two probabilities, use average threshold
178
+ final_proba = (roberta_proba + electra_proba) / 2
179
+ ensemble_threshold = (roberta_bundle.threshold + electra_bundle.threshold) / 2
180
+
181
+ flags = classify(final_proba, ensemble_threshold)
182
+
183
+ return PredictResponse(
184
+ text=req.text,
185
+ model_used=model_key,
186
+ is_spam=flags["is_spam"],
187
+ maybe_spam=flags["maybe_spam"],
188
+ spam_probability=round(final_proba, 4),
189
+ ensemble_threshold=ensemble_threshold,
190
+ maybe_spam_upper_threshold=MAYBE_SPAM_UPPER,
191
+ roberta=roberta_result,
192
+ electra=electra_result,
193
+ )
194
+
195
+
196
+ @app.post("/predict/batch")
197
+ def predict_batch(texts: list[str], model: str = "ensemble"):
198
+ if len(texts) > 50:
199
+ raise HTTPException(status_code=422, detail="Batch size limit is 50.")
200
+ results = []
201
+ for text in texts:
202
+ req = PredictRequest(text=text, model=model)
203
+ results.append(predict(req))
204
+ return results
205
+
206
+
207
+ # ── EML helper ────────────────────────────────────────────────────────────────
208
+
209
def extract_text_from_eml(raw_bytes: bytes) -> str:
    """Parse a .eml file and return a single string with subject + body text.

    Includes the Subject and From headers (extra classification signal),
    then all non-attachment text/plain parts; when a multipart message has
    no plain-text part at all, falls back to tag-stripped text/html parts.

    Bug fix: the old HTML-fallback condition
    ``not any(p.startswith("Subject") or "plain" in p for p in parts)``
    was True whenever a Subject header had been collected, so HTML-only
    emails produced no body text at all. The fallback now depends only on
    whether any text/plain part was found.
    """
    import html as html_lib
    import re

    msg = email.message_from_bytes(raw_bytes, policy=email_policy.default)

    parts = []

    # Subject line
    subject = msg.get("subject", "")
    if subject:
        parts.append(f"Subject: {subject}")

    # From for extra signal
    from_addr = msg.get("from", "")
    if from_addr:
        parts.append(f"From: {from_addr}")

    if msg.is_multipart():
        plain_parts = []
        html_parts = []
        # Walk MIME parts, collecting text content and skipping attachments.
        for part in msg.walk():
            ct = part.get_content_type()
            cd = str(part.get("Content-Disposition", ""))
            if "attachment" in cd:
                continue
            if ct == "text/plain":
                plain_parts.append(part.get_content())
            elif ct == "text/html":
                html_parts.append(part.get_content())
        if plain_parts:
            parts.extend(plain_parts)
        else:
            # Fallback to HTML only when no plain text exists: strip tags,
            # unescape entities, collapse whitespace.
            for raw_html in html_parts:
                text = re.sub(r"<[^>]+>", " ", raw_html)
                text = html_lib.unescape(text)
                text = re.sub(r"\s+", " ", text).strip()
                parts.append(text)
    else:
        parts.append(msg.get_content())

    return "\n".join(parts).strip()
246
+
247
+
248
+ @app.post("/predict/eml", response_model=PredictResponse)
249
+ async def predict_eml(file: UploadFile = File(...)):
250
+ if not file.filename.endswith(".eml"):
251
+ raise HTTPException(status_code=422, detail="Only .eml files are accepted.")
252
+
253
+ raw = await file.read()
254
+ if len(raw) > 5 * 1024 * 1024: # 5 MB guard
255
+ raise HTTPException(status_code=413, detail="File too large (max 5 MB).")
256
+
257
+ try:
258
+ text = extract_text_from_eml(raw)
259
+ except Exception as e:
260
+ raise HTTPException(status_code=422, detail=f"Failed to parse .eml: {e}")
261
+
262
+ if not text.strip():
263
+ raise HTTPException(status_code=422, detail="Could not extract any text from the .eml file.")
264
+
265
+ analyzed_text = text.strip()
266
+ print("\n=== [EMAIL SCAN] Content analyzed ===")
267
+ print(analyzed_text)
268
+ print("=== [END EMAIL CONTENT] ===\n")
269
+
270
+ # Reuse the existing ensemble prediction logic
271
+ return predict(PredictRequest(text=analyzed_text, model="ensemble"))
models/electra_large_final/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
models/electra_large_final/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "ElectraForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "embedding_size": 1024,
11
+ "eos_token_id": null,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
14
+ "hidden_size": 1024,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 4096,
17
+ "is_decoder": false,
18
+ "layer_norm_eps": 1e-12,
19
+ "max_position_embeddings": 512,
20
+ "model_type": "electra",
21
+ "num_attention_heads": 16,
22
+ "num_hidden_layers": 24,
23
+ "pad_token_id": 0,
24
+ "position_embedding_type": "absolute",
25
+ "summary_activation": "gelu",
26
+ "summary_last_dropout": 0.1,
27
+ "summary_type": "first",
28
+ "summary_use_proj": true,
29
+ "tie_word_embeddings": true,
30
+ "transformers_version": "5.3.0",
31
+ "type_vocab_size": 2,
32
+ "use_cache": false,
33
+ "vocab_size": 30522
34
+ }
models/electra_large_final/threshold_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "recommended_threshold": 0.35,
3
+ "standard_metrics": {
4
+ "accuracy": 0.9256,
5
+ "f1": 0.9051987767584098,
6
+ "precision": 0.9230769230769231,
7
+ "recall": 0.888
8
+ },
9
+ "custom_metrics": {
10
+ "accuracy": 0.9256,
11
+ "f1": 0.9055837563451776,
12
+ "precision": 0.9195876288659793,
13
+ "recall": 0.892
14
+ }
15
+ }
models/electra_large_final/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/electra_large_final/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
models/electra_large_final/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e251fe80c570139a5ddea6518864f1ccf76ef6536208c2d234507ba2c06c2b9
3
+ size 4856
models/roberta_large_final/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
models/roberta_large_final/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": 2,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 1024,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 4096,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-05,
18
+ "max_position_embeddings": 514,
19
+ "model_type": "roberta",
20
+ "num_attention_heads": 16,
21
+ "num_hidden_layers": 24,
22
+ "pad_token_id": 1,
23
+ "tie_word_embeddings": true,
24
+ "transformers_version": "5.3.0",
25
+ "type_vocab_size": 1,
26
+ "use_cache": false,
27
+ "vocab_size": 50265
28
+ }
models/roberta_large_final/threshold_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "recommended_threshold": 0.35,
3
+ "standard_metrics": {
4
+ "accuracy": 0.9352,
5
+ "f1": 0.916923076923077,
6
+ "precision": 0.9410526315789474,
7
+ "recall": 0.894
8
+ },
9
+ "custom_metrics": {
10
+ "accuracy": 0.9336,
11
+ "f1": 0.9150460593654043,
12
+ "precision": 0.9371069182389937,
13
+ "recall": 0.894
14
+ }
15
+ }
models/roberta_large_final/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/roberta_large_final/tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "errors": "replace",
8
+ "is_local": false,
9
+ "mask_token": "<mask>",
10
+ "model_max_length": 512,
11
+ "pad_token": "<pad>",
12
+ "sep_token": "</s>",
13
+ "tokenizer_class": "RobertaTokenizer",
14
+ "trim_offsets": true,
15
+ "unk_token": "<unk>"
16
+ }
models/roberta_large_final/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf7746da523087b4c98b10face3adad900b52a4c3ab325a7207442bec1e9eddb
3
+ size 4856