Spaces:

Gaykar
/

ClassifyEmail

Sleeping

App Files Files Community

Gaykar committed on Dec 24, 2025

Commit

6da1ce8

1 Parent(s): c69597c

added models

Browse files

Files changed (1) hide show

preditormodels.py +63 -0

preditormodels.py ADDED Viewed

	@@ -0,0 +1,63 @@

+from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
+import torch
+from config import URL_FEATURES ,device
+import numpy as np
+from pipeline import EmailFeatureExtractor
+import joblib
+import xgboost as xgb
+class PhishingPredictor:
+    """Phishing classifier combining DistilBERT text embeddings with URL features.
+
+    The models are loaded once in ``__init__``; ``predict`` is the entry point
+    for scoring a single email.
+    """
+
+    def __init__(self, bert_path: str, xgb_path: str):
+        """Load the tokenizer, the DistilBERT model and the XGBoost classifier.
+
+        Args:
+            bert_path: Location passed to ``from_pretrained`` for both the
+                tokenizer and the sequence-classification model.
+            xgb_path: Location passed to ``XGBClassifier.load_model``.
+        """
+        print("[INFO] Initializing Models...")
+        self.device = device
+
+        # Text encoder: tokenizer and model are read from the same path.
+        self.tokenizer = DistilBertTokenizerFast.from_pretrained(bert_path)
+        encoder = DistilBertForSequenceClassification.from_pretrained(bert_path)
+        encoder.to(self.device)
+        encoder.eval()  # put the module in evaluation mode before inference
+        self.bert_model = encoder
+
+        # Tabular classifier restored through ``load_model``.
+        # NOTE: per the original author's comment, ``load_model`` suits
+        # .json/.model files; a .pkl file would need joblib instead.
+        booster = xgb.XGBClassifier()
+        booster.load_model(xgb_path)
+        self.xgb_model = booster
+
+        # Project feature pipeline consumed by ``predict``.
+        self.extractor = EmailFeatureExtractor()
+
+    def get_cls_embedding(self, text: str) -> np.ndarray:
+        """Return the first-token (CLS) hidden state of the encoder for ``text``.
+
+        The input is truncated to at most 256 tokens. Only the ``distilbert``
+        backbone is run, so the classification head is bypassed.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            A 2-D NumPy array on the CPU, one row per input text. The original
+            author's note gives the hidden size as 768.
+        """
+        encoded = self.tokenizer(
+            text,
+            return_tensors="pt",
+            truncation=True,
+            padding=True,
+            max_length=256,
+        )
+        encoded = encoded.to(self.device)
+        # No gradients are needed for inference.
+        with torch.no_grad():
+            hidden_states = self.bert_model.distilbert(**encoded).last_hidden_state
+        # Position 0 along the sequence axis is the CLS token.
+        cls_vector = hidden_states[:, 0, :]
+        return cls_vector.cpu().numpy()
+
+    def predict(self, subject: str, body: str):
+        """Classify one email as ``"PHISHING"`` or ``"SAFE"``.
+
+        Args:
+            subject: Email subject, forwarded to the feature extractor.
+            body: Email body, forwarded to the feature extractor.
+
+        Returns:
+            A dict with three keys:
+            ``"prediction"`` - ``"PHISHING"`` when the class-1 probability
+            exceeds 0.5, otherwise ``"SAFE"``;
+            ``"confidence"`` - the class-1 probability as a percentage string
+            with two decimals;
+            ``"url_count"`` - the first row's ``URL_COUNT`` value as an int.
+
+        NOTE(review): ``"confidence"`` is always the class-1 (phishing)
+        probability, so a ``"SAFE"`` result carries a value of at most 50%.
+        Confirm this is the intended meaning for callers.
+        """
+        # Run the project pipeline; only the first row of its output is used.
+        features = self.extractor.transform(subject, body)
+        combined_text = features['text_combined'].iloc[0]
+        text_embedding = self.get_cls_embedding(combined_text)
+        # Numeric URL columns (19 of them per the original author's note).
+        url_matrix = features[URL_FEATURES].to_numpy(dtype=np.float32)
+        # Column-wise join: embedding first, then URL features
+        # (768 + 19 = 787 columns per the original author's note).
+        model_input = np.concatenate((text_embedding, url_matrix), axis=1)
+        class_probabilities = self.xgb_model.predict_proba(model_input)
+        prob = class_probabilities[0][1]
+        if prob > 0.5:
+            label = "PHISHING"
+        else:
+            label = "SAFE"
+        result = {"prediction": label, "confidence": f"{prob*100:.2f}%"}
+        result["url_count"] = int(features['URL_COUNT'].iloc[0])
+        return result