Perth0603 committed on
Commit
d2fbc7d
·
verified ·
1 Parent(s): ad3f1d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +202 -88
app.py CHANGED
@@ -1,98 +1,187 @@
1
  import os
2
  from typing import List, Optional, Dict
 
3
 
4
  import torch
 
 
5
  from fastapi import FastAPI, HTTPException
6
  from pydantic import BaseModel
7
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
-
9
- # Prefer MODEL_ID, fall back to HF_MODEL_ID, then default
10
- MODEL_ID = (
11
- os.environ.get("MODEL_ID")
12
- or os.environ.get("HF_MODEL_ID")
13
- or "Perth0603/phishing-email-mobilebert"
14
- )
15
-
16
- app = FastAPI(title="Phishing Text Classifier (Model-Authoritative)", version="1.0.0")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
 
 
 
 
19
  class PredictPayload(BaseModel):
20
  inputs: str
 
21
 
22
 
23
  class BatchPredictPayload(BaseModel):
24
  inputs: List[str]
 
25
 
26
 
27
  class LabeledText(BaseModel):
28
  text: str
29
- label: Optional[str] = None # optional ground truth for quick eval (accepts text)
30
 
31
 
32
  class EvalPayload(BaseModel):
33
  samples: List[LabeledText]
34
 
35
 
 
 
 
36
  _tokenizer = None
37
  _model = None
38
  _device = "cpu"
39
-
40
- # Cached normalized mapping/meta
41
- _NORM_LABELS_BY_IDX = None # normalized labels ordered by model indices
42
 
43
 
44
- def _normalize_label_text_only(txt: str) -> str:
45
- """
46
- Normalize model label text to PHISH/LEGIT when possible.
47
- If unfamiliar, return the uppercased original token.
48
- """
49
  t = (str(txt) if txt is not None else "").strip().upper()
50
- if t in ("PHISHING", "PHISH", "SPAM"):
51
  return "PHISH"
52
- if t in ("LEGIT", "LEGITIMATE", "SAFE", "HAM"):
53
  return "LEGIT"
54
- # keep other label names as-is (uppercased) so we don't force an incorrect mapping
55
  return t
56
 
57
 
58
  def _load_model():
59
- global _tokenizer, _model, _device, _NORM_LABELS_BY_IDX
 
60
 
61
  if _tokenizer is None or _model is None:
62
  _device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
63
  _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
64
  _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
65
  _model.to(_device)
66
- _model.eval() # important: disable dropout etc.
 
67
 
68
- # Warm-up (silent)
69
  with torch.no_grad():
70
  _ = _model(
71
  **_tokenizer(["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=512)
72
  .to(_device)
73
  ).logits
74
 
75
- # Read and normalize model labels (by index)
76
- id2label = getattr(_model.config, "id2label", {}) or {}
77
- num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
78
- _NORM_LABELS_BY_IDX = [_normalize_label_text_only(id2label.get(i, f"LABEL_{i}")) for i in range(num_labels)]
79
-
80
-
81
- def _predict_texts(texts: List[str]) -> List[Dict]:
82
- """
83
- Predict and return strictly model-authoritative outputs:
84
- - label: normalized model label (PHISH/LEGIT or other model label uppercased)
85
- - raw_label: original id2label string from model.config
86
- - is_phish: boolean derived from normalized label (True if normalized == "PHISH")
87
- - score: probability of predicted class
88
- - probs: dict of normalized label -> probability (or CLASS_i keys if unknown)
89
- - predicted_index: argmax index
90
- """
91
  _load_model()
92
  if not texts:
93
  return []
94
 
95
- # Tokenize batch
 
 
 
 
 
96
  enc = _tokenizer(
97
  texts,
98
  return_tensors="pt",
@@ -102,95 +191,120 @@ def _predict_texts(texts: List[str]) -> List[Dict]:
102
  )
103
  enc = {k: v.to(_device) for k, v in enc.items()}
104
 
 
105
  with torch.no_grad():
106
  logits = _model(**enc).logits
107
- probs = torch.softmax(logits, dim=-1) # [batch, num_labels]
 
 
108
 
109
- # Use the model’s own mapping
110
- id2label = getattr(_model.config, "id2label", None) or {}
111
- labels_by_idx_raw = [id2label.get(i, f"LABEL_{i}") for i in range(probs.shape[-1])]
112
- # normalized labels where possible
113
- labels_by_idx_norm = [_normalize_label_text_only(lbl) for lbl in labels_by_idx_raw]
114
 
115
  outputs: List[Dict] = []
116
- for i in range(probs.shape[0]):
117
- p = probs[i]
118
- idx = int(torch.argmax(p).item())
119
-
120
- raw_label = labels_by_idx_raw[idx]
121
- norm_label = labels_by_idx_norm[idx] # normalized where possible
122
-
123
- # Build probability map keyed by normalized labels when available,
124
- # otherwise fallback to CLASS_i keys to avoid collision
125
- prob_map: Dict[str, float] = {}
126
- for j, lbl_norm in enumerate(labels_by_idx_norm):
127
- key = lbl_norm if lbl_norm in ("PHISH", "LEGIT") else f"CLASS_{j}"
128
- prob_map[key] = float(p[j].item())
129
-
130
- outputs.append(
131
- {
132
- "label": norm_label, # authoritative label (model-driven, normalized)
133
- "raw_label": raw_label, # original model id2label value
134
- "is_phish": True if norm_label == "PHISH" else False,
135
- "score": float(p[idx].item()), # probability of predicted class
136
- "probs": prob_map, # per-class probabilities (keys normalized or CLASS_i)
137
- "predicted_index": idx,
138
- }
139
- )
 
 
 
 
 
140
 
141
  return outputs
142
 
143
 
 
 
 
 
144
  @app.get("/")
145
  def root():
 
146
  _load_model()
147
  return {
148
  "status": "ok",
149
  "model": MODEL_ID,
150
- "note": "This service returns predictions exactly as the model decides (label derived from model.config.id2label). Frontend should use `label` or `is_phish` as authority."
 
 
151
  }
152
 
153
 
154
  @app.get("/debug/labels")
155
  def debug_labels():
 
156
  _load_model()
 
157
  return {
 
 
158
  "id2label": getattr(_model.config, "id2label", {}),
159
  "label2id": getattr(_model.config, "label2id", {}),
160
  "num_labels": int(getattr(_model.config, "num_labels", 0)),
161
  "device": _device,
162
- "norm_labels_by_idx": _NORM_LABELS_BY_IDX,
163
  }
164
 
165
 
 
 
 
 
 
 
 
 
 
 
 
166
  @app.post("/predict")
167
  def predict(payload: PredictPayload):
 
168
  try:
169
- res = _predict_texts([payload.inputs])
170
  return res[0]
171
  except Exception as e:
172
- raise HTTPException(status_code=500, detail=f"Prediction error: {e}")
173
 
174
 
175
  @app.post("/predict-batch")
176
  def predict_batch(payload: BatchPredictPayload):
 
177
  try:
178
- return _predict_texts(payload.inputs)
179
  except Exception as e:
180
- raise HTTPException(status_code=500, detail=f"Batch prediction error: {e}")
181
 
182
 
183
  @app.post("/evaluate")
184
  def evaluate(payload: EvalPayload):
185
- """
186
- Quick on-the-spot test with provided labeled samples.
187
- The provided labels are interpreted as text labels (PHISH/LEGIT/etc.) — evaluation is done
188
- by comparing normalized GT text to model's normalized prediction (no 0/1 dataset mapping applied).
189
- """
190
  try:
191
  texts = [s.text for s in payload.samples]
192
- gts = [(_normalize_label_text_only(s.label) if s.label is not None else None) for s in payload.samples]
193
- preds = _predict_texts(texts)
194
 
195
  total = len(preds)
196
  correct = 0
@@ -209,16 +323,16 @@ def evaluate(payload: EvalPayload):
209
  acc = (correct / sum(1 for gt in gts if gt is not None)) if has_gts else None
210
 
211
  return {
212
- "accuracy": acc,
213
  "total": total,
 
214
  "predictions": preds,
215
  "per_class": per_class,
216
  }
217
  except Exception as e:
218
- raise HTTPException(status_code=500, detail=f"Evaluation error: {e}")
219
 
220
 
221
  if __name__ == "__main__":
222
- # Run: uvicorn app:app --host 0.0.0.0 --port 8000 --reload
223
  import uvicorn
224
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
  import os
2
  from typing import List, Optional, Dict
3
+ import re
4
 
5
  import torch
6
+ import torch.nn.functional as F
7
+ import nltk
8
  from fastapi import FastAPI, HTTPException
9
  from pydantic import BaseModel
10
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
11
+ from nltk.corpus import stopwords
12
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
13
+ from nltk.tokenize import word_tokenize
14
+ from textblob import TextBlob
15
+
16
# Download required NLTK corpora on first run.
# BUG FIX: the previous guard checked only 'tokenizers/punkt', so when punkt
# was already installed the except branch never ran and 'stopwords'/'wordnet'
# were never fetched. Each resource is now verified (and fetched) independently.
for _pkg, _path in (
    ("punkt", "tokenizers/punkt"),
    ("stopwords", "corpora/stopwords"),
    ("wordnet", "corpora/wordnet"),
):
    try:
        nltk.data.find(_path)
    except LookupError:
        nltk.download(_pkg)
23
+
24
# Hugging Face Hub id of the fine-tuned MobileBERT phishing classifier.
MODEL_ID = "Perth0603/phishing-email-mobilebert"

app = FastAPI(title="Phishing Text Classifier with Preprocessing", version="1.0.0")

# Temperature for softening predictions (1.0 = normal, >1.0 = softer, <1.0 = sharper)
TEMPERATURE = 2.5  # Adjust this value (try 1.5 to 3.0)
30
+
31
+
32
# ============================================================================
# TEXT PREPROCESSING CLASS
# ============================================================================
class TextPreprocessor:
    """NLP preprocessing for analysis and feature extraction.

    Provides tokenization, stop-word removal, stemming, lemmatization, and a
    lightweight sentiment / phishing-indicator analysis (TextBlob + regex).
    """

    # Patterns are compiled once at class-definition time so repeated calls to
    # sentiment_analysis() don't re-run re.search's pattern compile/cache
    # lookup on every request (they were inline literals before).
    _URGENT_RE = re.compile(r'\b(urgent|immediate|act now|verify|confirm|update|click|verify account)\b', re.IGNORECASE)
    _THREAT_RE = re.compile(r'\b(suspend|limited|expire|locked|disabled|restricted)\b', re.IGNORECASE)
    _URL_RE = re.compile(r'http\S+|www\S+')
    _URGENCY_RE = re.compile(r'\b(urgent|immediate|act now)\b', re.IGNORECASE)

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def tokenize(self, text: str) -> List[str]:
        """Break text into lower-cased word tokens."""
        return word_tokenize(text.lower())

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Keep only alphanumeric tokens that are not English stop words."""
        return [token for token in tokens if token.isalnum() and token not in self.stop_words]

    def stem(self, tokens: List[str]) -> List[str]:
        """Reduce tokens to their Porter stems."""
        return [self.stemmer.stem(token) for token in tokens]

    def lemmatize(self, tokens: List[str]) -> List[str]:
        """Reduce tokens to their WordNet lemmas."""
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def sentiment_analysis(self, text: str) -> Dict:
        """Analyze sentiment (TextBlob polarity/subjectivity) plus simple
        regex-based phishing indicators (urgency, threats, URLs)."""
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity
        subjectivity = blob.sentiment.subjectivity

        phishing_indicators = {
            "urgent_words": bool(self._URGENT_RE.search(text)),
            "threat_words": bool(self._THREAT_RE.search(text)),
            "suspicious_urls": bool(self._URL_RE.search(text)),
            "urgency_level": "HIGH" if self._URGENCY_RE.search(text) else "LOW",
        }

        return {
            "polarity": round(polarity, 4),
            "subjectivity": round(subjectivity, 4),
            "sentiment": "positive" if polarity > 0.1 else "negative" if polarity < -0.1 else "neutral",
            "is_persuasive": subjectivity > 0.5,
            "phishing_indicators": phishing_indicators,
        }

    def preprocess(self, text: str) -> Dict:
        """Run the full pipeline and return all intermediate artifacts."""
        tokens = self.tokenize(text)
        tokens_no_stop = self.remove_stopwords(tokens)
        stemmed = self.stem(tokens_no_stop)
        lemmatized = self.lemmatize(tokens_no_stop)
        sentiment = self.sentiment_analysis(text)

        return {
            "original_text": text,
            "tokens": tokens,
            "tokens_without_stopwords": tokens_no_stop,
            "stemmed_tokens": stemmed,
            "lemmatized_tokens": lemmatized,
            "sentiment": sentiment,
            "token_count": len(tokens_no_stop),
        }
97
 
98
 
99
# ============================================================================
# PYDANTIC MODELS
# ============================================================================
class PredictPayload(BaseModel):
    # Request body for /predict and /debug/preprocessing.
    inputs: str  # the text to classify
    include_preprocessing: bool = True  # attach NLP preprocessing artifacts to the response
105
 
106
 
107
class BatchPredictPayload(BaseModel):
    # Request body for /predict-batch.
    inputs: List[str]  # texts classified together in one batch
    include_preprocessing: bool = True  # attach NLP preprocessing artifacts per text
110
 
111
 
112
class LabeledText(BaseModel):
    # One evaluation sample: text plus optional ground-truth label.
    text: str
    label: Optional[str] = None  # free-form label text (normalized before comparison)
115
 
116
 
117
class EvalPayload(BaseModel):
    # Request body for /evaluate.
    samples: List[LabeledText]
119
 
120
 
121
# ============================================================================
# GLOBAL VARIABLES
# ============================================================================
# Lazily-initialized singletons; populated by _load_model() on first request.
_tokenizer = None  # transformers tokenizer for MODEL_ID
_model = None  # sequence-classification model (moved to _device)
_device = "cpu"  # becomes "cuda" in _load_model() when a GPU is available
_preprocessor = None  # TextPreprocessor instance
 
 
128
 
129
 
130
+ # ============================================================================
131
+ # HELPER FUNCTIONS
132
+ # ============================================================================
133
+ def _normalize_label(txt: str) -> str:
134
+ """Normalize label text"""
135
  t = (str(txt) if txt is not None else "").strip().upper()
136
+ if t in ("PHISHING", "PHISH", "SPAM", "1"):
137
  return "PHISH"
138
+ if t in ("LEGIT", "LEGITIMATE", "SAFE", "HAM", "0"):
139
  return "LEGIT"
 
140
  return t
141
 
142
 
143
  def _load_model():
144
+ """Load model, tokenizer, and preprocessor"""
145
+ global _tokenizer, _model, _device, _preprocessor
146
 
147
  if _tokenizer is None or _model is None:
148
  _device = "cuda" if torch.cuda.is_available() else "cpu"
149
+ print(f"\n{'='*60}")
150
+ print(f"Loading model: {MODEL_ID}")
151
+ print(f"Device: {_device}")
152
+ print(f"Temperature scaling: {TEMPERATURE}")
153
+ print(f"{'='*60}\n")
154
+
155
  _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
156
  _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
157
  _model.to(_device)
158
+ _model.eval()
159
+ _preprocessor = TextPreprocessor()
160
 
161
+ # Warm-up
162
  with torch.no_grad():
163
  _ = _model(
164
  **_tokenizer(["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=512)
165
  .to(_device)
166
  ).logits
167
 
168
+ id2label = getattr(_model.config, "id2label", {})
169
+ print(f"Model labels: {id2label}")
170
+ print(f"{'='*60}\n")
171
+
172
+
173
+ def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List[Dict]:
174
+ """Predict with temperature-scaled probabilities"""
 
 
 
 
 
 
 
 
 
175
  _load_model()
176
  if not texts:
177
  return []
178
 
179
+ # Get preprocessing info
180
+ preprocessing_info = None
181
+ if include_preprocessing:
182
+ preprocessing_info = [_preprocessor.preprocess(text) for text in texts]
183
+
184
+ # Tokenize
185
  enc = _tokenizer(
186
  texts,
187
  return_tensors="pt",
 
191
  )
192
  enc = {k: v.to(_device) for k, v in enc.items()}
193
 
194
+ # Predict with temperature scaling
195
  with torch.no_grad():
196
  logits = _model(**enc).logits
197
+ # Apply temperature scaling to soften probabilities
198
+ scaled_logits = logits / TEMPERATURE
199
+ probs = F.softmax(scaled_logits, dim=-1)
200
 
201
+ # Get labels from model config
202
+ id2label = getattr(_model.config, "id2label", {0: "LEGIT", 1: "PHISH"})
 
 
 
203
 
204
  outputs: List[Dict] = []
205
+ for text_idx in range(probs.shape[0]):
206
+ p = probs[text_idx]
207
+
208
+ # Get prediction
209
+ predicted_idx = int(torch.argmax(p).item())
210
+ predicted_label_raw = id2label.get(predicted_idx, f"CLASS_{predicted_idx}")
211
+ predicted_label_norm = _normalize_label(predicted_label_raw)
212
+ predicted_prob = float(p[predicted_idx].item())
213
+
214
+ # Build probability breakdown
215
+ prob_breakdown = {}
216
+ for i in range(len(p)):
217
+ label = _normalize_label(id2label.get(i, f"CLASS_{i}"))
218
+ prob_breakdown[label] = round(float(p[i].item()), 4)
219
+
220
+ output = {
221
+ "text": texts[text_idx][:100] + "..." if len(texts[text_idx]) > 100 else texts[text_idx],
222
+ "label": predicted_label_norm,
223
+ "raw_label": predicted_label_raw,
224
+ "is_phish": predicted_label_norm == "PHISH",
225
+ "confidence": round(predicted_prob * 100, 2),
226
+ "score": round(predicted_prob, 4),
227
+ "probs": prob_breakdown,
228
+ }
229
+
230
+ if include_preprocessing and preprocessing_info:
231
+ output["preprocessing"] = preprocessing_info[text_idx]
232
+
233
+ outputs.append(output)
234
 
235
  return outputs
236
 
237
 
238
+ # ============================================================================
239
+ # API ENDPOINTS
240
+ # ============================================================================
241
+
242
@app.get("/")
def root():
    """Health/info endpoint: reports model id, device, and temperature."""
    _load_model()
    info = {
        "status": "ok",
        "model": MODEL_ID,
        "device": _device,
        "temperature": TEMPERATURE,
        "note": "Using temperature scaling to calibrate probabilities",
    }
    return info
253
 
254
 
255
@app.get("/debug/labels")
def debug_labels():
    """Expose the loaded model's label mapping and runtime configuration."""
    _load_model()

    cfg = _model.config
    return {
        "status": "ok",
        "model_id": MODEL_ID,
        "id2label": getattr(cfg, "id2label", {}),
        "label2id": getattr(cfg, "label2id", {}),
        "num_labels": int(getattr(cfg, "num_labels", 0)),
        "device": _device,
        "temperature": TEMPERATURE,
    }
269
 
270
 
271
@app.post("/debug/preprocessing")
def debug_preprocessing(payload: PredictPayload):
    """Return the NLP preprocessing artifacts for a single input text."""
    try:
        _load_model()
        return _preprocessor.preprocess(payload.inputs)
    except Exception as e:
        # Boundary handler: surface any failure as an HTTP 500.
        raise HTTPException(status_code=500, detail=str(e))
280
+
281
+
282
@app.post("/predict")
def predict(payload: PredictPayload):
    """Classify a single text; returns the one prediction dict."""
    try:
        predictions = _predict_texts(
            [payload.inputs],
            include_preprocessing=payload.include_preprocessing,
        )
        return predictions[0]
    except Exception as e:
        # Boundary handler: surface any failure as an HTTP 500.
        raise HTTPException(status_code=500, detail=str(e))
290
 
291
 
292
@app.post("/predict-batch")
def predict_batch(payload: BatchPredictPayload):
    """Classify a list of texts in one batched forward pass."""
    try:
        results = _predict_texts(
            payload.inputs,
            include_preprocessing=payload.include_preprocessing,
        )
        return results
    except Exception as e:
        # Boundary handler: surface any failure as an HTTP 500.
        raise HTTPException(status_code=500, detail=str(e))
299
 
300
 
301
  @app.post("/evaluate")
302
  def evaluate(payload: EvalPayload):
303
+ """Evaluate on labeled samples"""
 
 
 
 
304
  try:
305
  texts = [s.text for s in payload.samples]
306
+ gts = [(_normalize_label(s.label) if s.label is not None else None) for s in payload.samples]
307
+ preds = _predict_texts(texts, include_preprocessing=False)
308
 
309
  total = len(preds)
310
  correct = 0
 
323
  acc = (correct / sum(1 for gt in gts if gt is not None)) if has_gts else None
324
 
325
  return {
326
+ "accuracy": round(acc, 4) if acc else None,
327
  "total": total,
328
+ "correct": correct,
329
  "predictions": preds,
330
  "per_class": per_class,
331
  }
332
  except Exception as e:
333
+ raise HTTPException(status_code=500, detail=str(e))
334
 
335
 
336
if __name__ == "__main__":
    # Local dev entry point; in production run:
    #   uvicorn app:app --host 0.0.0.0 --port 8000
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)