# Source: eddiebee — "Upload 3 files" (commit 1fef3dc, verified)
import pickle
from urllib.parse import urlparse
import gradio as gr
import pandas as pd
# Load the trained model and scaler from local pickle artifacts.
# NOTE(review): pickle.load executes arbitrary code embedded in the file —
# only ship .pkl files from a trusted source alongside this app.
with open('phishing_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
# Feature names, in training order, that the scaler and model expect as
# the DataFrame columns fed to them.
selected_features = [
    # URL / hostname lexical features
    'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_qm', 'nb_eq',
    'nb_slash', 'nb_www', 'ratio_digits_url', 'ratio_digits_host',
    'tld_in_subdomain', 'prefix_suffix', 'shortest_word_host',
    'longest_words_raw', 'longest_word_path', 'phish_hints',
    # Page-content and reputation features
    'nb_hyperlinks', 'ratio_intHyperlinks', 'empty_title',
    'domain_in_title', 'domain_age', 'google_index', 'page_rank',
]
def extract_features(url):
    """Extract the numeric features the phishing model was trained on.

    Parameters
    ----------
    url : str
        The URL to analyse.

    Returns
    -------
    dict
        Feature name -> numeric value.  The dict now contains every name
        listed in ``selected_features`` — previously 12 of them (e.g.
        ``tld_in_subdomain``, ``page_rank``) were missing, which produced
        all-NaN columns in the DataFrame passed to the scaler/model.
        The extra lexical counters the original returned are kept for
        backward compatibility.
    """
    import re  # local import: only needed for word splitting below

    parsed = urlparse(url)
    host = parsed.netloc
    path = parsed.path
    host_parts = [p for p in host.split('.') if p]
    tld = host_parts[-1] if host_parts else ''
    # Labels left of the registered domain (host minus "domain.tld").
    subdomain_parts = host_parts[:-2]

    def words(text):
        # Split on any non-alphanumeric run and drop empty fragments.
        return [w for w in re.split(r'[^A-Za-z0-9]+', text) if w]

    host_words = words(host)
    url_words = words(url)
    path_words = words(path)
    brands = ['google', 'paypal', 'facebook', 'ebay', 'amazon']

    url_features = {
        'length_url': len(url),
        'length_hostname': len(host),
        # Crude IP heuristic kept from the original: first host label is all
        # digits ("" -> False, so an empty host safely yields 0).
        'ip': int(host.split('.')[0].isdigit()),
        'nb_dots': url.count('.'),
        'nb_hyphens': url.count('-'),
        'nb_at': url.count('@'),
        'nb_qm': url.count('?'),
        'nb_and': url.count('&'),
        'nb_or': url.count('|'),
        'nb_eq': url.count('='),
        'nb_underscore': url.count('_'),
        'nb_tilde': url.count('~'),
        'nb_percent': url.count('%'),
        'nb_slash': url.count('/'),
        'nb_star': url.count('*'),
        'nb_colon': url.count(':'),
        'nb_comma': url.count(','),
        'nb_semicolumn': url.count(';'),
        'nb_dollar': url.count('$'),
        'nb_space': url.count(' '),
        'nb_www': int('www.' in url),
        'nb_com': int('.com' in url),
        'nb_dslash': int('//' in url),
        'http_in_path': int('http' in url),
        'https_token': int('https' in url),
        # Guard the divisions: the original raised ZeroDivisionError on an
        # empty URL / empty hostname.
        'ratio_digits_url': (sum(c.isdigit() for c in url) / len(url)) if url else 0.0,
        'ratio_digits_host': (sum(c.isdigit() for c in host) / len(host)) if host else 0.0,
        'punycode': int(url.startswith('xn--')),
        'shortening_service': int(any(s in url for s in ['bit.ly', 'goo.gl', 't.co', 'ow.ly', 'y2u.be'])),
        'path_extension': int(any(url.endswith(ext) for ext in ['.php', '.asp', '.jsp', '.do'])),
        'phish_hints': int(any(h in url for h in ['login', 'account', 'verify', 'confirm'])),
        'domain_in_brand': int(any(b in url for b in brands)),
        'brand_in_subdomain': int(any(b in host.split('.') for b in brands)),
        'brand_in_path': int(any(b in url.split('/') for b in brands)),
        'suspecious_tld': int(tld in ['info', 'xyz', 'top', 'club']),
        # --- features required by selected_features that the original
        # --- never computed (they silently became NaN columns) ---
        'tld_in_subdomain': int(bool(tld) and tld in subdomain_parts),
        'prefix_suffix': int('-' in host),
        'shortest_word_host': min((len(w) for w in host_words), default=0),
        'longest_words_raw': max((len(w) for w in url_words), default=0),
        'longest_word_path': max((len(w) for w in path_words), default=0),
        # The remaining features need the fetched page content / WHOIS /
        # search-engine data, which this function does not retrieve.
        # Neutral placeholders keep the pipeline NaN-free.
        # TODO(review): replace with real values if the page is fetched.
        'nb_hyperlinks': 0,
        'ratio_intHyperlinks': 0.0,
        'empty_title': 0,
        'domain_in_title': 0,
        'domain_age': -1,
        'google_index': 0,
        'page_rank': 0,
    }
    return url_features
# Classify a URL with the pre-fitted scaler + model pipeline.
def predict_phishing(url):
    """Return "Phishing" or "Legitimate" for the given URL string."""
    features = extract_features(url)
    # Column order must match what the scaler/model were fitted on.
    frame = pd.DataFrame([features], columns=selected_features)
    scaled = scaler.transform(frame)
    label = model.predict(scaled)[0]
    return "Phishing" if label == 1 else "Legitimate"
# Create the Gradio interface: a single URL textbox in, the predicted
# label ("Phishing" / "Legitimate") out.
demo = gr.Interface(
    fn=predict_phishing,
    inputs=gr.Textbox(label="URL"),
    outputs=gr.Textbox(label="Prediction"),
    title="Phishing URL Detection",
    description="Enter a URL and the model will predict if it's legitimate or phishing."
)

# Launch the local web server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()