Rasel Santillan committed on
Commit
7a3576b
·
1 Parent(s): f3f638f

Add application file

Browse files
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Run as a non-root user; uid 1000 is the Hugging Face Spaces convention
RUN useradd -m -u 1000 user
USER user
# Make user-scheme pip installs (placed under ~/.local/bin) resolvable
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install dependencies first so this layer is cached when only app code changes
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy application source; port 7860 is the port HF Spaces exposes by default
COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/app.cpython-312.pyc ADDED
Binary file (5.89 kB). View file
 
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for phishing URL detection.
3
+ Provides a REST API endpoint to predict if a URL is phishing or legitimate.
4
+ """
5
+
6
+ from fastapi import FastAPI, HTTPException
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from pydantic import BaseModel, Field, validator
9
+ from typing import Optional
10
+ import uvicorn
11
+
12
+ from model.model import load_model, predict_url
13
+
14
# Initialize FastAPI app
app = FastAPI(
    title="Phishing URL Detection API",
    description="API for detecting phishing URLs using machine learning",
    version="1.0.0"
)

# Add CORS middleware to allow web access
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Loaded model components; stays None until the startup handler succeeds.
# Endpoints check this for None to report "model not loaded".
model_components = None
@app.on_event("startup")
async def startup_event():
    """Populate the module-level model cache once, when the app boots.

    Re-raises any loading failure so the server refuses to start with no model.
    NOTE(review): `on_event` is deprecated in newer FastAPI in favor of
    lifespan handlers — migrating requires changing the FastAPI(...) call.
    """
    global model_components
    try:
        model_components = load_model()
    except Exception as exc:
        print(f"❌ Failed to load model on startup: {exc}")
        raise
    print("✅ Model loaded successfully on startup")
# Request and Response Models
class URLRequest(BaseModel):
    """Request model for URL prediction."""
    # Raw URL submitted by the client; must be non-empty.
    url: str = Field(..., description="The URL to check for phishing", min_length=1)

    @validator('url')
    def validate_url(cls, v):
        """Validate that URL is not empty after stripping whitespace."""
        # min_length=1 alone would accept all-whitespace input, so strip first
        # and store the trimmed value.
        if not v.strip():
            raise ValueError('URL cannot be empty')
        return v.strip()

    class Config:
        # Example payload shown in the interactive OpenAPI docs.
        # NOTE(review): `validator`/`schema_extra` are pydantic v1 APIs;
        # pydantic v2 renames them `field_validator`/`json_schema_extra`.
        schema_extra = {
            "example": {
                "url": "https://www.google.com"
            }
        }
class PredictionResponse(BaseModel):
    """Response model for URL prediction."""
    # Optional fields remain None when feature extraction or prediction fails.
    url: str = Field(..., description="The URL that was analyzed")
    predicted_label: Optional[int] = Field(None, description="0 for legitimate, 1 for phishing, None if error")
    prediction: str = Field(..., description="Human-readable prediction: 'legitimate', 'phishing', 'unknown', or 'error'")
    phish_probability: Optional[float] = Field(None, description="Probability of being phishing (0.0 to 1.0)")
    confidence: Optional[float] = Field(None, description="Confidence percentage of the prediction")
    features_extracted: bool = Field(..., description="Whether features were successfully extracted from the URL")
    error: Optional[str] = Field(None, description="Error message if prediction failed")

    class Config:
        # Example payload shown in the interactive OpenAPI docs.
        # NOTE(review): `schema_extra` is the pydantic v1 key; v2 uses `json_schema_extra`.
        schema_extra = {
            "example": {
                "url": "https://www.google.com",
                "predicted_label": 0,
                "prediction": "legitimate",
                "phish_probability": 0.0234,
                "confidence": 97.66,
                "features_extracted": True,
                "error": None
            }
        }
# API Endpoints
@app.get("/")
async def root():
    """Root endpoint: returns API name, version, and an endpoint directory."""
    endpoint_map = {
        "/predict": "POST - Predict if a URL is phishing or legitimate",
        "/health": "GET - Check API health status",
        "/docs": "GET - Interactive API documentation",
    }
    return {
        "message": "Phishing URL Detection API",
        "version": "1.0.0",
        "endpoints": endpoint_map,
    }
+
104
@app.get("/health")
async def health_check():
    """Health check endpoint: reports liveness and whether the model loaded."""
    loaded = model_components is not None
    return {"status": "healthy", "model_loaded": loaded}
@app.post("/predict", response_model=PredictionResponse)
async def predict(request: URLRequest):
    """
    Predict if a URL is phishing or legitimate.

    Args:
        request: URLRequest containing the URL to analyze

    Returns:
        PredictionResponse with prediction results

    Raises:
        HTTPException: 503 when the model has not been loaded yet,
            500 when prediction (or response construction) fails
    """
    # Fail fast while the startup handler has not populated the model yet.
    if model_components is None:
        raise HTTPException(
            status_code=503,
            detail="Model not loaded. Please try again later."
        )

    # Response construction stays inside the try so a malformed result
    # dict also surfaces as a 500 with a descriptive detail message.
    try:
        outcome = predict_url(request.url, model_components)
        return PredictionResponse(**outcome)
    except Exception as exc:
        raise HTTPException(
            status_code=500,
            detail=f"Prediction failed: {str(exc)}"
        )
# Run the application (development entry point).
# In the Docker image, uvicorn is launched via CMD instead, without reload.
if __name__ == "__main__":
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        reload=True  # auto-reload on code change; development use only
    )
model/__pycache__/model.cpython-312.pyc ADDED
Binary file (6.89 kB). View file
 
model/model.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Model prediction helper module for phishing URL detection.
Handles model loading, feature extraction, and prediction.
"""

import os
import sys
import numpy as np
import pandas as pd
import joblib
import warnings
# scikit-learn emits this warning when estimators trained on named columns
# receive plain arrays; it is expected here and safe to silence.
warnings.filterwarnings("ignore", message="X does not have valid feature names", category=UserWarning)

# Add parent directory to path to import url_feature_extraction module
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from url_feature_extraction.url_feature_extractor import extract_features

# Global variable to cache the loaded model (populated lazily by load_model)
_model_cache = None
def load_model(model_path="model/url_stacking_model.joblib"):
    """
    Load the saved stacking model from file, caching it per resolved path.

    Args:
        model_path (str): Path to the model file relative to the FastAPI app directory

    Returns:
        dict: Dictionary containing model components:
            - base_models: Dictionary of base models
            - meta_scaler: Scaler for meta features
            - meta_model: Meta model for final prediction
            - feature_names: List of feature names
            - model_names: List of model names

    Raises:
        FileNotFoundError: If no file exists at the resolved model path.
        KeyError: If the saved artifact lacks one of the expected components.
    """
    global _model_cache

    # Resolve relative to this file's parent directory so the model is found
    # regardless of the process's current working directory.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    full_model_path = os.path.normpath(os.path.join(current_dir, "..", model_path))

    # Cache per resolved path. (The previous implementation cached a single
    # model and silently ignored `model_path` on every call after the first.)
    if _model_cache is None:
        _model_cache = {}
    if full_model_path in _model_cache:
        return _model_cache[full_model_path]

    if not os.path.exists(full_model_path):
        raise FileNotFoundError(f"Model file not found at: {full_model_path}")

    # Load model (joblib deserializes the scikit-learn estimators inside)
    model_data = joblib.load(full_model_path)
    print(f"✅ Model loaded successfully from: {full_model_path}")

    components = {
        "base_models": model_data["base_models"],
        "meta_scaler": model_data["meta_scaler"],
        "meta_model": model_data["meta_model"],
        "feature_names": model_data["feature_names"],
        "model_names": model_data["model_names"]
    }
    _model_cache[full_model_path] = components
    return components
def predict_url(url: str, model_components: dict = None):
    """
    Make prediction for a given URL.

    Pipeline:
      1. Extract features from the raw URL via url_feature_extractor.
      2. Run the stacking model on those features.
      3. Package the outcome into a JSON-serializable dict.

    Args:
        url (str): Raw URL to predict
        model_components (dict, optional): Pre-loaded model components.
            If None, will load the model.

    Returns:
        dict with keys: url, predicted_label (0/1 or None), prediction
        ("legitimate"/"phishing"/"unknown"/"error"), phish_probability,
        confidence, features_extracted, and (on failure) error.
    """
    if model_components is None:
        model_components = load_model()

    features_dict = extract_features(url)

    # A None `has_title` signals the extractor could not fetch the page,
    # i.e. feature extraction as a whole failed.
    if features_dict.get('has_title') is None:
        return {
            "url": url,
            "predicted_label": None,
            "prediction": "unknown",
            "phish_probability": None,
            "confidence": None,
            "features_extracted": False,
            "error": "Failed to extract features from URL. The URL may be unreachable or invalid."
        }

    try:
        outcome = predict_from_features(features_dict, model_components)
    except Exception as exc:
        return {
            "url": url,
            "predicted_label": None,
            "prediction": "error",
            "phish_probability": None,
            "confidence": None,
            "features_extracted": True,
            "error": f"Prediction error: {str(exc)}"
        }

    label = outcome["predicted_label"]
    phish_prob = outcome["phish_probability"]
    # Confidence = probability of whichever class was predicted, as a percent.
    confidence = 100 * max(phish_prob, 1 - phish_prob)

    return {
        "url": url,
        "predicted_label": label,
        "prediction": "phishing" if label == 1 else "legitimate",
        "phish_probability": round(phish_prob, 4),
        "confidence": round(confidence, 2),
        "features_extracted": True
    }
def predict_from_features(features_dict: dict, model_components: dict):
    """
    Make predictions given a dictionary of extracted features.

    Implements two-level stacking:
      - Level 0: each base model emits its phishing probability.
      - Level 1: the meta model combines those (scaled) probabilities.

    Args:
        features_dict (dict): Mapping of feature name -> feature value
        model_components (dict): Components returned by load_model()

    Returns:
        dict: Contains 'predicted_label' (0 or 1) and 'phish_probability' (float)

    Raises:
        ValueError: If any feature expected by the model is absent.
    """
    feature_names = model_components["feature_names"]
    base_models = model_components["base_models"]

    # One-row DataFrame keeps the 2-D shape the estimators expect.
    frame = pd.DataFrame([features_dict])

    absent = set(feature_names) - set(frame.columns)
    if absent:
        raise ValueError(f"❌ Missing required features: {absent}")

    # Drop unknown columns and enforce the training-time column order.
    frame = frame[feature_names]

    # Level 0: each base model contributes its P(phishing) as a meta-feature.
    level0 = np.zeros((frame.shape[0], len(base_models)))
    for col, estimator in enumerate(base_models.values()):
        level0[:, col] = estimator.predict_proba(frame)[:, 1]

    # NOTE(review): columns are named from `model_names` while values are
    # filled in `base_models` iteration order — assumes both agree; verify.
    level0_df = pd.DataFrame(
        level0, columns=[f"{name}_pred" for name in model_components["model_names"]]
    )

    # Level 1: scale the meta-features, then let the meta model decide.
    scaled = model_components["meta_scaler"].transform(level0_df)
    scaled_df = pd.DataFrame(scaled, columns=level0_df.columns)

    meta_model = model_components["meta_model"]
    label = meta_model.predict(scaled_df)[0]
    probability = meta_model.predict_proba(scaled_df)[:, 1][0]

    return {
        "predicted_label": int(label),
        "phish_probability": float(probability)
    }
model/url_stacking_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc4e81eb5ce124016facc45fbe74d8b71f250c7676003b00d17f67bb730b5840
3
+ size 279828900
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FastAPI and web server
2
+ fastapi
3
+ uvicorn[standard]
4
+
5
+ # Data processing and ML
6
+ pandas==2.2.2
7
+ numpy==2.0.2
8
+ scikit-learn==1.6.1
9
+ lightgbm==4.6.0
10
+ xgboost==3.0.5
11
+ joblib==1.5.2
12
+
13
+ # Feature extraction dependencies
14
+ requests
15
+ beautifulsoup4
16
+ urllib3