Perth0603 committed on
Commit
7769849
·
verified ·
1 Parent(s): ca7522d

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +28 -0
  2. README.md +41 -12
  3. app.py +131 -0
  4. requirements.txt +13 -0
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ PIP_NO_CACHE_DIR=1
6
+
7
+ WORKDIR /app
8
+
9
+ # Writable cache directory for HF/torch
10
+ RUN mkdir -p /data/.cache && chmod -R 777 /data
11
+ ENV HF_HOME=/data/.cache \
12
+ TRANSFORMERS_CACHE=/data/.cache \
13
+ TORCH_HOME=/data/.cache
14
+
15
+ # System deps (optional but helps with torch wheels)
16
+ RUN apt-get update && apt-get install -y --no-install-recommends \
17
+ build-essential git && \
18
+ rm -rf /var/lib/apt/lists/*
19
+
20
+ COPY requirements.txt /app/requirements.txt
21
+ RUN pip install -r /app/requirements.txt
22
+
23
+ COPY app.py /app/app.py
24
+
25
+ EXPOSE 7860
26
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
27
+
28
+
README.md CHANGED
@@ -1,12 +1,41 @@
1
- ---
2
- title: Random Forest Model For PhishingDetection
3
- emoji: 🌍
4
- colorFrom: yellow
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 5.47.2
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: PhishWatch Proxy
3
+ emoji: 🛡️
4
+ sdk: docker
5
+ ---
6
+
7
+ # Hugging Face Space - Phishing Text Classifier (Docker + FastAPI)
8
+
9
+ This Space exposes two endpoints so the Flutter app can call them reliably:
10
+
11
+ - `/predict` for text/email/SMS classification via Transformers
12
+ - `/predict-url` for URL classification via your scikit-learn Random Forest model
13
+
14
+ ## Files
15
+ - Dockerfile - builds a small FastAPI server image
16
+ - app.py - FastAPI app that loads the model and returns `{ label, score }`.
17
+ - requirements.txt - Python dependencies.
18
+
19
+ ## How to deploy
20
+ 1. Create a new Space on Hugging Face (type: Docker).
21
+ 2. Upload the contents of this `hf_space/` folder to the Space root (including Dockerfile).
22
+ 3. In Space Settings → Variables, add:
23
+ - MODEL_ID = Perth0603/phishing-email-mobilebert
24
+ - URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
25
+ - URL_FILENAME = url_rf_model.joblib (set to your artifact filename)
26
+ 4. Wait for the Space to build and become green. Test:
27
+ - GET `/` should return `{ status: ok, model: ... }`
28
+ - POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
29
+ - POST `/predict-url` with `{ "url": "https://example.com/login" }`
30
+
31
+ ## Flutter app config
32
+ Set the Space URL in your env file so the app targets the Space instead of the Hosted Inference API:
33
+
34
+ ```
35
+ {"HF_SPACE_URL":"https://<your-space>.hf.space"}
36
+ ```
37
+
38
+ Run the app:
39
+ ```
40
+ flutter run --dart-define-from-file=hf.env.json
41
+ ```
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ.setdefault("HOME", "/data")
3
+ os.environ.setdefault("XDG_CACHE_HOME", "/data/.cache")
4
+ os.environ.setdefault("HF_HOME", "/data/.cache")
5
+ os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
6
+ os.environ.setdefault("TORCH_HOME", "/data/.cache")
7
+
8
+ from fastapi import FastAPI
9
+ from fastapi.responses import JSONResponse
10
+ from pydantic import BaseModel
11
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
+ from huggingface_hub import hf_hub_download
13
+ import joblib
14
+ import torch
15
+
16
+
17
+ MODEL_ID = os.environ.get("MODEL_ID", "Perth0603/phishing-email-mobilebert")
18
+ URL_REPO = os.environ.get("URL_REPO", "Perth0603/Random-Forest-Model-for-PhishingDetection")
19
+ URL_REPO_TYPE = os.environ.get("URL_REPO_TYPE", "model") # model|space|dataset
20
+ URL_FILENAME = os.environ.get("URL_FILENAME", "url_rf_model.joblib")
21
+
22
+ # Ensure writable cache directory for HF/torch inside Spaces Docker
23
+ CACHE_DIR = os.environ.get("HF_CACHE_DIR", "/data/.cache")
24
+ os.makedirs(CACHE_DIR, exist_ok=True)
25
+
26
+ app = FastAPI(title="Phishing Text Classifier", version="1.0.0")
27
+
28
+
29
+ class PredictPayload(BaseModel):
30
+ inputs: str
31
+
32
+
33
+ # Lazy singletons for model/tokenizer
34
+ _tokenizer = None
35
+ _model = None
36
+ _url_model = None
37
+
38
+
39
+ def _load_url_model():
40
+ global _url_model
41
+ if _url_model is None:
42
+ # Prefer local artifact if present (e.g., committed into the Space repo)
43
+ local_path = os.path.join(os.getcwd(), URL_FILENAME)
44
+ if os.path.exists(local_path):
45
+ _url_model = joblib.load(local_path)
46
+ return
47
+ # Download model artifact from HF Hub
48
+ model_path = hf_hub_download(
49
+ repo_id=URL_REPO,
50
+ filename=URL_FILENAME,
51
+ repo_type=URL_REPO_TYPE,
52
+ cache_dir=CACHE_DIR,
53
+ )
54
+ _url_model = joblib.load(model_path)
55
+
56
+
57
+ def _load_model():
58
+ global _tokenizer, _model
59
+ if _tokenizer is None or _model is None:
60
+ _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
61
+ _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
62
+ # Warm-up
63
+ with torch.no_grad():
64
+ _ = _model(**_tokenizer(["warm up"], return_tensors="pt")).logits
65
+
66
+
67
+ @app.get("/")
68
+ def root():
69
+ return {"status": "ok", "model": MODEL_ID}
70
+
71
+
72
+ @app.post("/predict")
73
+ def predict(payload: PredictPayload):
74
+ try:
75
+ _load_model()
76
+ with torch.no_grad():
77
+ inputs = _tokenizer([payload.inputs], return_tensors="pt", truncation=True, max_length=512)
78
+ logits = _model(**inputs).logits
79
+ probs = torch.softmax(logits, dim=-1)[0]
80
+ score, idx = torch.max(probs, dim=0)
81
+ except Exception as e:
82
+ return JSONResponse(status_code=500, content={"error": str(e)})
83
+
84
+ # Map common ids to labels (kept generic; your config also has these)
85
+ id2label = {0: "LEGIT", 1: "PHISH"}
86
+ label = id2label.get(int(idx), str(int(idx)))
87
+ return {"label": label, "score": float(score)}
88
+
89
+
90
+ class PredictUrlPayload(BaseModel):
91
+ url: str
92
+
93
+
94
+ @app.post("/predict-url")
95
+ def predict_url(payload: PredictUrlPayload):
96
+ try:
97
+ _load_url_model()
98
+ # Expect the sklearn pipeline to accept raw URL string and output either
99
+ # probabilities via predict_proba or binary via predict
100
+ model = _url_model
101
+ score = None
102
+ label = None
103
+ if hasattr(model, "predict_proba"):
104
+ proba = model.predict_proba([payload.url])[0]
105
+ # Assume index 1 corresponds to PHISH
106
+ if len(proba) == 2:
107
+ score = float(proba[1])
108
+ label = "PHISH" if score >= 0.5 else "LEGIT"
109
+ else:
110
+ # Multiclass fallback: take max
111
+ max_idx = int(proba.argmax())
112
+ score = float(proba[max_idx])
113
+ label = "PHISH" if max_idx == 1 else "LEGIT"
114
+ else:
115
+ pred = model.predict([payload.url])[0]
116
+ # Common encodings: 1=phish, 0=legit or strings
117
+ if isinstance(pred, (int, float)):
118
+ label = "PHISH" if int(pred) == 1 else "LEGIT"
119
+ score = 1.0 if label == "PHISH" else 0.0
120
+ else:
121
+ up = str(pred).strip().upper()
122
+ if up in ("PHISH", "PHISHING", "MALICIOUS"):
123
+ label, score = "PHISH", 1.0
124
+ else:
125
+ label, score = "LEGIT", 0.0
126
+ except Exception as e:
127
+ return JSONResponse(status_code=500, content={"error": str(e)})
128
+
129
+ return {"label": label, "score": float(score)}
130
+
131
+
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cpu
2
+ fastapi==0.115.0
3
+ uvicorn==0.30.6
4
+ transformers==4.46.3
5
+ torch==2.3.1+cpu
6
+ accelerate>=0.33.0
7
+ safetensors>=0.4.3
8
+
9
+ # URL model dependencies
10
+ huggingface_hub>=0.23.0
11
+ scikit-learn>=1.3.0
12
+ joblib>=1.3.0
13
+