Spaces:

Perth0603
/

Random-Forest-Model-for-PhishingDetection

Sleeping

App Files Files Community

Perth0603 committed on Oct 1, 2025

Commit

7769849

verified ·

1 Parent(s): ca7522d

Upload 4 files

Browse files

Files changed (4) hide show

Dockerfile +28 -0
README.md +41 -12
app.py +131 -0
requirements.txt +13 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,28 @@

+# Image for the FastAPI server defined in app.py, served by uvicorn on port 7860.
+FROM python:3.10-slim
+# Do not write .pyc files, do not buffer stdout/stderr, and disable pip's cache.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1
+WORKDIR /app
+# Writable cache directory for HF/torch (world-writable so any runtime user can use it)
+RUN mkdir -p /data/.cache && chmod -R 777 /data
+# Point the Hugging Face / Transformers / torch caches at that directory;
+# app.py sets the same paths as defaults.
+ENV HF_HOME=/data/.cache \
+    TRANSFORMERS_CACHE=/data/.cache \
+    TORCH_HOME=/data/.cache
+# System deps (optional but helps with torch wheels)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential git && \
+    rm -rf /var/lib/apt/lists/*
+# Install Python dependencies before copying app.py so the layer is reused
+# when only the application code changes.
+COPY requirements.txt /app/requirements.txt
+RUN pip install -r /app/requirements.txt
+COPY app.py /app/app.py
+# Port that uvicorn listens on in CMD below.
+EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,12 +1,41 @@
----
-title: Random Forest Model For PhishingDetection
-emoji: 🌍
-colorFrom: yellow
-colorTo: red
-sdk: gradio
-sdk_version: 5.47.2
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: PhishWatch Proxy
+emoji: 🛡️
+sdk: docker
+---
+# Hugging Face Space - Phishing Text Classifier (Docker + FastAPI)
+This Space exposes two endpoints so the Flutter app can call them reliably:
+- `/predict` for text/email/SMS classification via Transformers
+- `/predict-url` for URL classification via your scikit-learn Random Forest model
+## Files
+- Dockerfile - builds a small FastAPI server image
+- app.py - FastAPI app that lazily loads both models (text classifier and URL Random Forest) and returns `{ label, score }` from each prediction endpoint.
+- requirements.txt - Python dependencies.
+## How to deploy
+1. Create a new Space on Hugging Face (type: Docker).
+2. Upload the contents of this `hf_space/` folder to the Space root (including Dockerfile).
+3. In Space Settings → Variables, add:
+   - MODEL_ID = Perth0603/phishing-email-mobilebert
+   - URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
+   - URL_FILENAME = url_rf_model.joblib  (set to your artifact filename)
+4. Wait for the Space to build and become green. Test:
+   - GET `/` should return `{ "status": "ok", "model": "<MODEL_ID>" }`
+   - POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
+   - POST `/predict-url` with `{ "url": "https://example.com/login" }`
+## Flutter app config
+Set the Space URL in your env file so the app targets the Space instead of the Hosted Inference API:
+```
+{"HF_SPACE_URL":"https://<your-space>.hf.space"}
+```
+Run the app:
+```
+flutter run --dart-define-from-file=hf.env.json
+```

app.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import os
+os.environ.setdefault("HOME", "/data")  # defaults only: values already set in the environment (e.g. by the Dockerfile) win
+os.environ.setdefault("XDG_CACHE_HOME", "/data/.cache")  # set before the transformers/huggingface_hub/torch imports below
+os.environ.setdefault("HF_HOME", "/data/.cache")  # Hugging Face cache root
+os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")  # same path as HF_HOME
+os.environ.setdefault("TORCH_HOME", "/data/.cache")  # torch cache, same path again
+from fastapi import FastAPI
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from huggingface_hub import hf_hub_download
+import joblib
+import torch
+MODEL_ID = os.environ.get("MODEL_ID", "Perth0603/phishing-email-mobilebert")  # Hub id of the text classifier loaded by _load_model()
+URL_REPO = os.environ.get("URL_REPO", "Perth0603/Random-Forest-Model-for-PhishingDetection")  # Hub repo holding the URL model artifact
+URL_REPO_TYPE = os.environ.get("URL_REPO_TYPE", "model")  # model|space|dataset
+URL_FILENAME = os.environ.get("URL_FILENAME", "url_rf_model.joblib")  # artifact filename, looked up locally first and then in URL_REPO
+# Ensure a writable cache directory for HF/torch inside Spaces Docker; overridable via HF_CACHE_DIR
+CACHE_DIR = os.environ.get("HF_CACHE_DIR", "/data/.cache")
+os.makedirs(CACHE_DIR, exist_ok=True)  # created at import time; no error if it already exists
+app = FastAPI(title="Phishing Text Classifier", version="1.0.0")  # ASGI app served as "app:app" by uvicorn
+class PredictPayload(BaseModel):  # request body for POST /predict
+    inputs: str  # raw text to classify; passed to the tokenizer as a single-item batch
+# Lazy singletons: None until first use, then populated by _load_model() / _load_url_model()
+_tokenizer = None  # tokenizer for MODEL_ID
+_model = None  # sequence-classification model for MODEL_ID
+_url_model = None  # joblib-loaded URL model (expected to accept raw URL strings)
+def _load_url_model():
+    global _url_model
+    if _url_model is None:
+        # Prefer local artifact if present (e.g., committed into the Space repo)
+        local_path = os.path.join(os.getcwd(), URL_FILENAME)
+        if os.path.exists(local_path):
+            _url_model = joblib.load(local_path)
+            return
+        # Download model artifact from HF Hub
+        model_path = hf_hub_download(
+            repo_id=URL_REPO,
+            filename=URL_FILENAME,
+            repo_type=URL_REPO_TYPE,
+            cache_dir=CACHE_DIR,
+        )
+        _url_model = joblib.load(model_path)
+def _load_model():
+    global _tokenizer, _model
+    if _tokenizer is None or _model is None:
+        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
+        _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
+        # Warm-up
+        with torch.no_grad():
+            _ = _model(**_tokenizer(["warm up"], return_tensors="pt")).logits
+@app.get("/")
+def root():
+    return {"status": "ok", "model": MODEL_ID}
+@app.post("/predict")
+def predict(payload: PredictPayload):
+    try:
+        _load_model()
+        with torch.no_grad():
+            inputs = _tokenizer([payload.inputs], return_tensors="pt", truncation=True, max_length=512)
+            logits = _model(**inputs).logits
+            probs = torch.softmax(logits, dim=-1)[0]
+            score, idx = torch.max(probs, dim=0)
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"error": str(e)})
+    # Map common ids to labels (kept generic; your config also has these)
+    id2label = {0: "LEGIT", 1: "PHISH"}
+    label = id2label.get(int(idx), str(int(idx)))
+    return {"label": label, "score": float(score)}
+class PredictUrlPayload(BaseModel):  # request body for POST /predict-url
+    url: str  # raw URL string; passed unmodified to the URL model as a single-item batch
+@app.post("/predict-url")
+def predict_url(payload: PredictUrlPayload):
+    try:
+        _load_url_model()
+        # Expect the sklearn pipeline to accept raw URL string and output either
+        # probabilities via predict_proba or binary via predict
+        model = _url_model
+        score = None
+        label = None
+        if hasattr(model, "predict_proba"):
+            proba = model.predict_proba([payload.url])[0]
+            # Assume index 1 corresponds to PHISH
+            if len(proba) == 2:
+                score = float(proba[1])
+                label = "PHISH" if score >= 0.5 else "LEGIT"
+            else:
+                # Multiclass fallback: take max
+                max_idx = int(proba.argmax())
+                score = float(proba[max_idx])
+                label = "PHISH" if max_idx == 1 else "LEGIT"
+        else:
+            pred = model.predict([payload.url])[0]
+            # Common encodings: 1=phish, 0=legit or strings
+            if isinstance(pred, (int, float)):
+                label = "PHISH" if int(pred) == 1 else "LEGIT"
+                score = 1.0 if label == "PHISH" else 0.0
+            else:
+                up = str(pred).strip().upper()
+                if up in ("PHISH", "PHISHING", "MALICIOUS"):
+                    label, score = "PHISH", 1.0
+                else:
+                    label, score = "LEGIT", 0.0
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"error": str(e)})
+    return {"label": label, "score": float(score)}

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+--extra-index-url https://download.pytorch.org/whl/cpu
+fastapi==0.115.0
+uvicorn==0.30.6
+transformers==4.46.3
+torch==2.3.1+cpu
+accelerate>=0.33.0
+safetensors>=0.4.3
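+# URL model dependencies (the joblib artifact is a pickle: keep scikit-learn/joblib compatible with the versions used to train and save it)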
+huggingface_hub>=0.23.0
+scikit-learn>=1.3.0
+joblib>=1.3.0