Spaces:
Sleeping
Sleeping
albin
committed on
Commit
·
324e0e0
1
Parent(s):
4a2b74d
modify features encoding
Browse files
- label_encoders.pkl +3 -0
- main.py +31 -15
label_encoders.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6828590f38440abf21c24157ec2c7ab8e25bf1100a277918afd2f5ead03f6912
|
| 3 |
+
size 7719889
|
main.py
CHANGED
|
@@ -8,6 +8,8 @@ from fastapi.middleware.cors import CORSMiddleware
|
|
| 8 |
from pydantic import BaseModel
|
| 9 |
import pickle
|
| 10 |
import joblib
|
|
|
|
|
|
|
| 11 |
from extraction_features import extract_features
|
| 12 |
|
| 13 |
app = FastAPI()
|
|
@@ -24,7 +26,13 @@ app.add_middleware(
|
|
| 24 |
# model_file = open('logistic_regression_model.pkl', 'rb')
|
| 25 |
# model = pickle.load(model_file, encoding='bytes')
|
| 26 |
model = joblib.load('logistic_regression_model.pkl')
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
class Msg(BaseModel):
|
| 30 |
msg: str
|
|
@@ -73,7 +81,17 @@ async def predict(request: Request, requess: Req = Depends(form_req)):
|
|
| 73 |
url = requess.url
|
| 74 |
|
| 75 |
features = extract_features(str(url))
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
# data.append(str(features['URL']))
|
| 78 |
# data.extend(int(features['URLLength']))
|
| 79 |
# data.extend(str(features['Domain']))
|
|
@@ -85,19 +103,17 @@ async def predict(request: Request, requess: Req = Depends(form_req)):
|
|
| 85 |
# data.extend(float(features['DegitRatioInURL']))
|
| 86 |
# data.extend(float(features['SpacialCharRatioInURL']))
|
| 87 |
# data.extend(int(features['IsHTTPS']))
|
| 88 |
-
data.append(features['URL'])
|
| 89 |
-
data.append(features['URLLength'])
|
| 90 |
-
data.append(features['Domain'])
|
| 91 |
-
data.append(features['DomainLength'])
|
| 92 |
-
data.append(features['TLD'])
|
| 93 |
-
data.append(features['CharContinuationRate'])
|
| 94 |
-
data.append(features['TLDLength'])
|
| 95 |
-
data.append(features['NoOfSubDomain'])
|
| 96 |
-
data.append(features['DegitRatioInURL'])
|
| 97 |
-
data.append(features['SpacialCharRatioInURL'])
|
| 98 |
-
data.append(features['IsHTTPS'])
|
| 99 |
-
# Convert every feature into numeric value
|
| 100 |
-
data = [float(x) if isinstance(x, str) else x for x in data]
|
| 101 |
|
| 102 |
prediction = model.predict([data])
|
| 103 |
output = prediction[0]
|
|
|
|
| 8 |
from pydantic import BaseModel
|
| 9 |
import pickle
|
| 10 |
import joblib
|
| 11 |
+
import pandas as pd
|
| 12 |
+
from sklearn.preprocessing import LabelEncoder
|
| 13 |
from extraction_features import extract_features
|
| 14 |
|
| 15 |
app = FastAPI()
|
|
|
|
| 26 |
# model_file = open('logistic_regression_model.pkl', 'rb')
|
| 27 |
# model = pickle.load(model_file, encoding='bytes')
|
| 28 |
model = joblib.load('logistic_regression_model.pkl')
|
| 29 |
+
label_encoders = joblib.load('label_encoders.pkl')
|
| 30 |
+
# Columns used in the model
|
| 31 |
+
selected_columns = [
|
| 32 |
+
'URL', 'URLLength', 'Domain', 'DomainLength', 'TLD',
|
| 33 |
+
'CharContinuationRate', 'TLDLength', 'NoOfSubDomain',
|
| 34 |
+
'DegitRatioInURL', 'SpacialCharRatioInURL', 'IsHTTPS'
|
| 35 |
+
]
|
| 36 |
|
| 37 |
class Msg(BaseModel):
|
| 38 |
msg: str
|
|
|
|
| 81 |
url = requess.url
|
| 82 |
|
| 83 |
features = extract_features(str(url))
|
| 84 |
+
dataFrame_features = pd.DataFrame([features])
|
| 85 |
+
|
| 86 |
+
# Apply features encoding (convert everything into int64)
|
| 87 |
+
for column in selected_columns:
|
| 88 |
+
if column in label_encoders:
|
| 89 |
+
encoder = label_encoders[column]
|
| 90 |
+
dataFrame_features[column] = encoder.transform(dataFrame_features[column])
|
| 91 |
+
|
| 92 |
+
data = dataFrame_features[selected_columns].values
|
| 93 |
+
|
| 94 |
+
# data = []
|
| 95 |
# data.append(str(features['URL']))
|
| 96 |
# data.extend(int(features['URLLength']))
|
| 97 |
# data.extend(str(features['Domain']))
|
|
|
|
| 103 |
# data.extend(float(features['DegitRatioInURL']))
|
| 104 |
# data.extend(float(features['SpacialCharRatioInURL']))
|
| 105 |
# data.extend(int(features['IsHTTPS']))
|
| 106 |
+
# data.append(features['URL'])
|
| 107 |
+
# data.append(features['URLLength'])
|
| 108 |
+
# data.append(features['Domain'])
|
| 109 |
+
# data.append(features['DomainLength'])
|
| 110 |
+
# data.append(features['TLD'])
|
| 111 |
+
# data.append(features['CharContinuationRate'])
|
| 112 |
+
# data.append(features['TLDLength'])
|
| 113 |
+
# data.append(features['NoOfSubDomain'])
|
| 114 |
+
# data.append(features['DegitRatioInURL'])
|
| 115 |
+
# data.append(features['SpacialCharRatioInURL'])
|
| 116 |
+
# data.append(features['IsHTTPS'])
|
|
|
|
|
|
|
| 117 |
|
| 118 |
prediction = model.predict([data])
|
| 119 |
output = prediction[0]
|