Perth0603 committed on
Commit
ad3f1d2
·
verified ·
1 Parent(s): b418015

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -194
app.py CHANGED
@@ -1,184 +1,98 @@
1
  import os
2
  from typing import List, Optional, Dict
3
- import re
4
 
5
  import torch
6
- import nltk
7
  from fastapi import FastAPI, HTTPException
8
  from pydantic import BaseModel
9
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
- from nltk.corpus import stopwords
11
- from nltk.stem import PorterStemmer, WordNetLemmatizer
12
- from nltk.tokenize import word_tokenize
13
- from textblob import TextBlob
14
-
15
- # Download NLTK data
16
- try:
17
- nltk.data.find('tokenizers/punkt')
18
- except LookupError:
19
- nltk.download('punkt')
20
- nltk.download('stopwords')
21
- nltk.download('wordnet')
22
-
23
- # ✅ CHANGE THIS TO POINT TO YOUR MODEL REPOSITORY
24
- MODEL_ID = "Perth0603/phishing-email-mobilebert" # ← Your model storage repo
25
-
26
- app = FastAPI(title="Phishing Text Classifier with Preprocessing", version="1.0.0")
27
-
28
-
29
- # ============================================================================
30
- # TEXT PREPROCESSING CLASS
31
- # ============================================================================
32
- class TextPreprocessor:
33
- """NLP preprocessing for analysis and feature extraction"""
34
-
35
- def __init__(self):
36
- self.stemmer = PorterStemmer()
37
- self.lemmatizer = WordNetLemmatizer()
38
- self.stop_words = set(stopwords.words('english'))
39
-
40
- def tokenize(self, text: str) -> List[str]:
41
- """Break text into tokens"""
42
- return word_tokenize(text.lower())
43
-
44
- def remove_stopwords(self, tokens: List[str]) -> List[str]:
45
- """Remove common stop words"""
46
- return [token for token in tokens if token.isalnum() and token not in self.stop_words]
47
-
48
- def stem(self, tokens: List[str]) -> List[str]:
49
- """Reduce tokens to stems"""
50
- return [self.stemmer.stem(token) for token in tokens]
51
-
52
- def lemmatize(self, tokens: List[str]) -> List[str]:
53
- """Reduce tokens to lemmas"""
54
- return [self.lemmatizer.lemmatize(token) for token in tokens]
55
-
56
- def sentiment_analysis(self, text: str) -> Dict:
57
- """Analyze sentiment and phishing indicators"""
58
- blob = TextBlob(text)
59
- polarity = blob.sentiment.polarity
60
- subjectivity = blob.sentiment.subjectivity
61
-
62
- phishing_indicators = {
63
- "urgent_words": bool(re.search(r'\b(urgent|immediate|act now|verify|confirm|update|click|verify account)\b', text, re.IGNORECASE)),
64
- "threat_words": bool(re.search(r'\b(suspend|limited|expire|locked|disabled|restricted)\b', text, re.IGNORECASE)),
65
- "suspicious_urls": bool(re.search(r'http\S+|www\S+', text)),
66
- "urgency_level": "HIGH" if re.search(r'\b(urgent|immediate|act now)\b', text, re.IGNORECASE) else "LOW"
67
- }
68
-
69
- return {
70
- "polarity": round(polarity, 4),
71
- "subjectivity": round(subjectivity, 4),
72
- "sentiment": "positive" if polarity > 0.1 else "negative" if polarity < -0.1 else "neutral",
73
- "is_persuasive": subjectivity > 0.5,
74
- "phishing_indicators": phishing_indicators
75
- }
76
-
77
- def preprocess(self, text: str) -> Dict:
78
- """Preprocessing for analysis"""
79
- tokens = self.tokenize(text)
80
- tokens_no_stop = self.remove_stopwords(tokens)
81
- stemmed = self.stem(tokens_no_stop)
82
- lemmatized = self.lemmatize(tokens_no_stop)
83
- sentiment = self.sentiment_analysis(text)
84
-
85
- return {
86
- "original_text": text,
87
- "tokens": tokens,
88
- "tokens_without_stopwords": tokens_no_stop,
89
- "stemmed_tokens": stemmed,
90
- "lemmatized_tokens": lemmatized,
91
- "sentiment": sentiment,
92
- "token_count": len(tokens_no_stop)
93
- }
94
 
95
 
96
- # ============================================================================
97
- # PYDANTIC MODELS
98
- # ============================================================================
99
  class PredictPayload(BaseModel):
100
  inputs: str
101
- include_preprocessing: bool = True
102
 
103
 
104
  class BatchPredictPayload(BaseModel):
105
  inputs: List[str]
106
- include_preprocessing: bool = True
107
 
108
 
109
  class LabeledText(BaseModel):
110
  text: str
111
- label: Optional[str] = None
112
 
113
 
114
  class EvalPayload(BaseModel):
115
  samples: List[LabeledText]
116
 
117
 
118
- # ============================================================================
119
- # GLOBAL VARIABLES
120
- # ============================================================================
121
  _tokenizer = None
122
  _model = None
123
  _device = "cpu"
124
- _preprocessor = None
 
 
125
 
126
 
127
- # ============================================================================
128
- # HELPER FUNCTIONS
129
- # ============================================================================
130
- def _normalize_label(txt: str) -> str:
131
- """Normalize label text"""
132
  t = (str(txt) if txt is not None else "").strip().upper()
133
- if t in ("PHISHING", "PHISH", "SPAM", "1"):
134
  return "PHISH"
135
- if t in ("LEGIT", "LEGITIMATE", "SAFE", "HAM", "0"):
136
  return "LEGIT"
 
137
  return t
138
 
139
 
140
  def _load_model():
141
- """Load model, tokenizer, and preprocessor"""
142
- global _tokenizer, _model, _device, _preprocessor
143
 
144
  if _tokenizer is None or _model is None:
145
  _device = "cuda" if torch.cuda.is_available() else "cpu"
146
- print(f"\n{'='*60}")
147
- print(f"Loading model: {MODEL_ID}")
148
- print(f"Device: {_device}")
149
- print(f"{'='*60}\n")
150
-
151
  _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
152
  _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
153
  _model.to(_device)
154
- _model.eval()
155
- _preprocessor = TextPreprocessor()
156
 
157
- # Warm-up
158
  with torch.no_grad():
159
  _ = _model(
160
  **_tokenizer(["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=512)
161
  .to(_device)
162
  ).logits
163
 
164
- # Check label mapping
165
- id2label = getattr(_model.config, "id2label", {})
166
- print(f"Model labels: {id2label}")
167
- print(f"{'='*60}\n")
168
-
169
-
170
- def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List[Dict]:
171
- """Predict with correct label mapping"""
 
 
 
 
 
 
 
 
172
  _load_model()
173
  if not texts:
174
  return []
175
 
176
- # Get preprocessing info
177
- preprocessing_info = None
178
- if include_preprocessing:
179
- preprocessing_info = [_preprocessor.preprocess(text) for text in texts]
180
-
181
- # Tokenize
182
  enc = _tokenizer(
183
  texts,
184
  return_tensors="pt",
@@ -188,115 +102,95 @@ def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List
188
  )
189
  enc = {k: v.to(_device) for k, v in enc.items()}
190
 
191
- # Predict
192
  with torch.no_grad():
193
  logits = _model(**enc).logits
194
- probs = torch.softmax(logits, dim=-1)
195
 
196
- # Get labels from model config
197
- id2label = getattr(_model.config, "id2label", {0: "LEGIT", 1: "PHISH"})
 
 
 
198
 
199
  outputs: List[Dict] = []
200
- for text_idx in range(probs.shape[0]):
201
- p = probs[text_idx]
202
-
203
- # Get prediction
204
- predicted_idx = int(torch.argmax(p).item())
205
- predicted_label_raw = id2label.get(predicted_idx, f"CLASS_{predicted_idx}")
206
- predicted_label_norm = _normalize_label(predicted_label_raw)
207
- predicted_prob = float(p[predicted_idx].item())
208
-
209
- # Build probability breakdown
210
- prob_breakdown = {}
211
- for i in range(len(p)):
212
- label = _normalize_label(id2label.get(i, f"CLASS_{i}"))
213
- prob_breakdown[label] = round(float(p[i].item()), 4)
214
-
215
- output = {
216
- "text": texts[text_idx][:100] + "..." if len(texts[text_idx]) > 100 else texts[text_idx],
217
- "label": predicted_label_norm,
218
- "raw_label": predicted_label_raw,
219
- "is_phish": predicted_label_norm == "PHISH",
220
- "confidence": round(predicted_prob * 100, 2),
221
- "score": round(predicted_prob, 4),
222
- "probs": prob_breakdown,
223
- }
224
-
225
- if include_preprocessing and preprocessing_info:
226
- output["preprocessing"] = preprocessing_info[text_idx]
227
-
228
- outputs.append(output)
229
 
230
  return outputs
231
 
232
 
233
- # ============================================================================
234
- # API ENDPOINTS
235
- # ============================================================================
236
-
237
  @app.get("/")
238
  def root():
239
- """Root endpoint"""
240
  _load_model()
241
  return {
242
  "status": "ok",
243
  "model": MODEL_ID,
244
- "device": _device,
245
  }
246
 
247
 
248
  @app.get("/debug/labels")
249
  def debug_labels():
250
- """View model configuration"""
251
  _load_model()
252
-
253
  return {
254
- "status": "ok",
255
- "model_id": MODEL_ID,
256
  "id2label": getattr(_model.config, "id2label", {}),
257
  "label2id": getattr(_model.config, "label2id", {}),
258
  "num_labels": int(getattr(_model.config, "num_labels", 0)),
259
  "device": _device,
 
260
  }
261
 
262
 
263
- @app.post("/debug/preprocessing")
264
- def debug_preprocessing(payload: PredictPayload):
265
- """Debug preprocessing"""
266
- try:
267
- _load_model()
268
- preprocessing = _preprocessor.preprocess(payload.inputs)
269
- return preprocessing
270
- except Exception as e:
271
- raise HTTPException(status_code=500, detail=str(e))
272
-
273
-
274
  @app.post("/predict")
275
  def predict(payload: PredictPayload):
276
- """Single prediction"""
277
  try:
278
- res = _predict_texts([payload.inputs], include_preprocessing=payload.include_preprocessing)
279
  return res[0]
280
  except Exception as e:
281
- raise HTTPException(status_code=500, detail=str(e))
282
 
283
 
284
  @app.post("/predict-batch")
285
  def predict_batch(payload: BatchPredictPayload):
286
- """Batch predictions"""
287
  try:
288
- return _predict_texts(payload.inputs, include_preprocessing=payload.include_preprocessing)
289
  except Exception as e:
290
- raise HTTPException(status_code=500, detail=str(e))
291
 
292
 
293
  @app.post("/evaluate")
294
  def evaluate(payload: EvalPayload):
295
- """Evaluate on labeled samples"""
 
 
 
 
296
  try:
297
  texts = [s.text for s in payload.samples]
298
- gts = [(_normalize_label(s.label) if s.label is not None else None) for s in payload.samples]
299
- preds = _predict_texts(texts, include_preprocessing=False)
300
 
301
  total = len(preds)
302
  correct = 0
@@ -315,16 +209,16 @@ def evaluate(payload: EvalPayload):
315
  acc = (correct / sum(1 for gt in gts if gt is not None)) if has_gts else None
316
 
317
  return {
318
- "accuracy": round(acc, 4) if acc else None,
319
  "total": total,
320
- "correct": correct,
321
  "predictions": preds,
322
  "per_class": per_class,
323
  }
324
  except Exception as e:
325
- raise HTTPException(status_code=500, detail=str(e))
326
 
327
 
328
  if __name__ == "__main__":
 
329
  import uvicorn
330
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
import os
from typing import List, Optional, Dict

import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Model repository resolution: the MODEL_ID env var wins, then HF_MODEL_ID,
# then the hard-coded default repo.
_DEFAULT_MODEL_ID = "Perth0603/phishing-email-mobilebert"
MODEL_ID = os.environ.get("MODEL_ID") or os.environ.get("HF_MODEL_ID") or _DEFAULT_MODEL_ID

app = FastAPI(title="Phishing Text Classifier (Model-Authoritative)", version="1.0.0")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
 
 
 
 
19
# ---- Request schemas -----------------------------------------------------

class PredictPayload(BaseModel):
    """Body for /predict: a single text to classify."""
    inputs: str


class BatchPredictPayload(BaseModel):
    """Body for /predict-batch: several texts classified in one batch."""
    inputs: List[str]


class LabeledText(BaseModel):
    """One evaluation sample: free text plus an optional ground-truth label."""
    text: str
    label: Optional[str] = None  # optional ground truth for quick eval (accepts text)


class EvalPayload(BaseModel):
    """Body for /evaluate: a list of (text, optional label) samples."""
    samples: List[LabeledText]
34
 
35
 
 
 
 
36
# ---- Lazily-initialized global state (populated by _load_model) ----------
_tokenizer = None  # tokenizer loaded from MODEL_ID
_model = None      # sequence-classification model loaded from MODEL_ID
_device = "cpu"    # resolved at load time ("cuda" when available)

# Cached normalized mapping/meta
_NORM_LABELS_BY_IDX = None  # normalized labels ordered by model indices
42
 
43
 
44
+ def _normalize_label_text_only(txt: str) -> str:
45
+ """
46
+ Normalize model label text to PHISH/LEGIT when possible.
47
+ If unfamiliar, return the uppercased original token.
48
+ """
49
  t = (str(txt) if txt is not None else "").strip().upper()
50
+ if t in ("PHISHING", "PHISH", "SPAM"):
51
  return "PHISH"
52
+ if t in ("LEGIT", "LEGITIMATE", "SAFE", "HAM"):
53
  return "LEGIT"
54
+ # keep other label names as-is (uppercased) so we don't force an incorrect mapping
55
  return t
56
 
57
 
58
def _load_model():
    """
    Lazily load tokenizer + model once per process.

    Moves the model to the best available device, switches it to eval mode,
    runs a silent warm-up forward pass, and caches the normalized per-index
    label names in _NORM_LABELS_BY_IDX for later use.
    """
    global _tokenizer, _model, _device, _NORM_LABELS_BY_IDX

    if _tokenizer is None or _model is None:
        _device = "cuda" if torch.cuda.is_available() else "cpu"
        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
        _model.to(_device)
        _model.eval()  # important: disable dropout etc.

        # Warm-up (silent) so the first real request doesn't pay first-call cost
        with torch.no_grad():
            _ = _model(
                **_tokenizer(["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=512)
                .to(_device)
            ).logits

        # Read and normalize model labels (by index).
        id2label = getattr(_model.config, "id2label", {}) or {}
        # Fall back to len(id2label) when num_labels is missing/0 in the config,
        # so the cached list is never silently empty while labels exist.
        num_labels = int(getattr(_model.config, "num_labels", 0) or 0) or len(id2label)
        _NORM_LABELS_BY_IDX = [
            _normalize_label_text_only(id2label.get(i, f"LABEL_{i}")) for i in range(num_labels)
        ]
79
+
80
+
81
+ def _predict_texts(texts: List[str]) -> List[Dict]:
82
+ """
83
+ Predict and return strictly model-authoritative outputs:
84
+ - label: normalized model label (PHISH/LEGIT or other model label uppercased)
85
+ - raw_label: original id2label string from model.config
86
+ - is_phish: boolean derived from normalized label (True if normalized == "PHISH")
87
+ - score: probability of predicted class
88
+ - probs: dict of normalized label -> probability (or CLASS_i keys if unknown)
89
+ - predicted_index: argmax index
90
+ """
91
  _load_model()
92
  if not texts:
93
  return []
94
 
95
+ # Tokenize batch
 
 
 
 
 
96
  enc = _tokenizer(
97
  texts,
98
  return_tensors="pt",
 
102
  )
103
  enc = {k: v.to(_device) for k, v in enc.items()}
104
 
 
105
  with torch.no_grad():
106
  logits = _model(**enc).logits
107
+ probs = torch.softmax(logits, dim=-1) # [batch, num_labels]
108
 
109
+ # Use the model’s own mapping
110
+ id2label = getattr(_model.config, "id2label", None) or {}
111
+ labels_by_idx_raw = [id2label.get(i, f"LABEL_{i}") for i in range(probs.shape[-1])]
112
+ # normalized labels where possible
113
+ labels_by_idx_norm = [_normalize_label_text_only(lbl) for lbl in labels_by_idx_raw]
114
 
115
  outputs: List[Dict] = []
116
+ for i in range(probs.shape[0]):
117
+ p = probs[i]
118
+ idx = int(torch.argmax(p).item())
119
+
120
+ raw_label = labels_by_idx_raw[idx]
121
+ norm_label = labels_by_idx_norm[idx] # normalized where possible
122
+
123
+ # Build probability map keyed by normalized labels when available,
124
+ # otherwise fallback to CLASS_i keys to avoid collision
125
+ prob_map: Dict[str, float] = {}
126
+ for j, lbl_norm in enumerate(labels_by_idx_norm):
127
+ key = lbl_norm if lbl_norm in ("PHISH", "LEGIT") else f"CLASS_{j}"
128
+ prob_map[key] = float(p[j].item())
129
+
130
+ outputs.append(
131
+ {
132
+ "label": norm_label, # authoritative label (model-driven, normalized)
133
+ "raw_label": raw_label, # original model id2label value
134
+ "is_phish": True if norm_label == "PHISH" else False,
135
+ "score": float(p[idx].item()), # probability of predicted class
136
+ "probs": prob_map, # per-class probabilities (keys normalized or CLASS_i)
137
+ "predicted_index": idx,
138
+ }
139
+ )
 
 
 
 
 
140
 
141
  return outputs
142
 
143
 
 
 
 
 
144
@app.get("/")
def root():
    """Health/info endpoint; also triggers lazy model loading."""
    _load_model()
    info = {
        "status": "ok",
        "model": MODEL_ID,
        "note": "This service returns predictions exactly as the model decides (label derived from model.config.id2label). Frontend should use `label` or `is_phish` as authority."
    }
    return info
152
 
153
 
154
@app.get("/debug/labels")
def debug_labels():
    """Expose the model's label configuration so mapping issues can be diagnosed."""
    _load_model()
    cfg = _model.config
    return {
        "id2label": getattr(cfg, "id2label", {}),
        "label2id": getattr(cfg, "label2id", {}),
        "num_labels": int(getattr(cfg, "num_labels", 0)),
        "device": _device,
        "norm_labels_by_idx": _NORM_LABELS_BY_IDX,
    }
164
 
165
 
 
 
 
 
 
 
 
 
 
 
 
166
@app.post("/predict")
def predict(payload: PredictPayload):
    """
    Classify a single text.

    Returns the single prediction dict for `payload.inputs`. Failures are
    surfaced as HTTP 500 with the original error message; the exception is
    chained via `from e` so the root cause is preserved in tracebacks/logs.
    """
    try:
        res = _predict_texts([payload.inputs])
        return res[0]
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {e}") from e
173
 
174
 
175
@app.post("/predict-batch")
def predict_batch(payload: BatchPredictPayload):
    """
    Classify a list of texts in one batched forward pass.

    An empty `inputs` list yields an empty list. Failures are surfaced as
    HTTP 500 with the original error message; the exception is chained via
    `from e` so the root cause is preserved in tracebacks/logs.
    """
    try:
        return _predict_texts(payload.inputs)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Batch prediction error: {e}") from e
181
 
182
 
183
  @app.post("/evaluate")
184
  def evaluate(payload: EvalPayload):
185
+ """
186
+ Quick on-the-spot test with provided labeled samples.
187
+ The provided labels are interpreted as text labels (PHISH/LEGIT/etc.) — evaluation is done
188
+ by comparing normalized GT text to model's normalized prediction (no 0/1 dataset mapping applied).
189
+ """
190
  try:
191
  texts = [s.text for s in payload.samples]
192
+ gts = [(_normalize_label_text_only(s.label) if s.label is not None else None) for s in payload.samples]
193
+ preds = _predict_texts(texts)
194
 
195
  total = len(preds)
196
  correct = 0
 
209
  acc = (correct / sum(1 for gt in gts if gt is not None)) if has_gts else None
210
 
211
  return {
212
+ "accuracy": acc,
213
  "total": total,
 
214
  "predictions": preds,
215
  "per_class": per_class,
216
  }
217
  except Exception as e:
218
+ raise HTTPException(status_code=500, detail=f"Evaluation error: {e}")
219
 
220
 
221
if __name__ == "__main__":
    # Run: uvicorn app:app --host 0.0.0.0 --port 8000 --reload
    # Dev convenience entry point: binds all interfaces on port 8000.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)