# Source: eddiebee — "Upload 3 files" (commit 1fef3dc, verified)
import pickle
from urllib.parse import urlparse
import gradio as gr
import pandas as pd
# Load the trained model and scaler from local pickle artifacts.
# NOTE(review): pickle.load executes arbitrary code embedded in the file —
# only ship .pkl files from a trusted source alongside this app.
with open('phishing_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
# Feature names, in training order, that the scaler and model expect as
# the DataFrame columns fed to them.
selected_features = [
    # URL / hostname lexical features
    'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_qm', 'nb_eq',
    'nb_slash', 'nb_www', 'ratio_digits_url', 'ratio_digits_host',
    'tld_in_subdomain', 'prefix_suffix', 'shortest_word_host',
    'longest_words_raw', 'longest_word_path', 'phish_hints',
    # Page-content and reputation features
    'nb_hyperlinks', 'ratio_intHyperlinks', 'empty_title',
    'domain_in_title', 'domain_age', 'google_index', 'page_rank',
]
def extract_features(url):
    """Extract the numeric features the phishing model was trained on.

    Parameters
    ----------
    url : str
        The URL to analyse.

    Returns
    -------
    dict
        Feature name -> numeric value.  The dict now contains every name
        listed in ``selected_features`` — previously 12 of them (e.g.
        ``tld_in_subdomain``, ``page_rank``) were missing, which produced
        all-NaN columns in the DataFrame passed to the scaler/model.
        The extra lexical counters the original returned are kept for
        backward compatibility.
    """
    import re  # local import: only needed for word splitting below

    parsed = urlparse(url)
    host = parsed.netloc
    path = parsed.path
    host_parts = [p for p in host.split('.') if p]
    tld = host_parts[-1] if host_parts else ''
    # Labels left of the registered domain (host minus "domain.tld").
    subdomain_parts = host_parts[:-2]

    def words(text):
        # Split on any non-alphanumeric run and drop empty fragments.
        return [w for w in re.split(r'[^A-Za-z0-9]+', text) if w]

    host_words = words(host)
    url_words = words(url)
    path_words = words(path)
    brands = ['google', 'paypal', 'facebook', 'ebay', 'amazon']

    url_features = {
        'length_url': len(url),
        'length_hostname': len(host),
        # Crude IP heuristic kept from the original: first host label is all
        # digits ("" -> False, so an empty host safely yields 0).
        'ip': int(host.split('.')[0].isdigit()),
        'nb_dots': url.count('.'),
        'nb_hyphens': url.count('-'),
        'nb_at': url.count('@'),
        'nb_qm': url.count('?'),
        'nb_and': url.count('&'),
        'nb_or': url.count('|'),
        'nb_eq': url.count('='),
        'nb_underscore': url.count('_'),
        'nb_tilde': url.count('~'),
        'nb_percent': url.count('%'),
        'nb_slash': url.count('/'),
        'nb_star': url.count('*'),
        'nb_colon': url.count(':'),
        'nb_comma': url.count(','),
        'nb_semicolumn': url.count(';'),
        'nb_dollar': url.count('$'),
        'nb_space': url.count(' '),
        'nb_www': int('www.' in url),
        'nb_com': int('.com' in url),
        'nb_dslash': int('//' in url),
        'http_in_path': int('http' in url),
        'https_token': int('https' in url),
        # Guard the divisions: the original raised ZeroDivisionError on an
        # empty URL / empty hostname.
        'ratio_digits_url': (sum(c.isdigit() for c in url) / len(url)) if url else 0.0,
        'ratio_digits_host': (sum(c.isdigit() for c in host) / len(host)) if host else 0.0,
        'punycode': int(url.startswith('xn--')),
        'shortening_service': int(any(s in url for s in ['bit.ly', 'goo.gl', 't.co', 'ow.ly', 'y2u.be'])),
        'path_extension': int(any(url.endswith(ext) for ext in ['.php', '.asp', '.jsp', '.do'])),
        'phish_hints': int(any(h in url for h in ['login', 'account', 'verify', 'confirm'])),
        'domain_in_brand': int(any(b in url for b in brands)),
        'brand_in_subdomain': int(any(b in host.split('.') for b in brands)),
        'brand_in_path': int(any(b in url.split('/') for b in brands)),
        'suspecious_tld': int(tld in ['info', 'xyz', 'top', 'club']),
        # --- features required by selected_features that the original
        # --- never computed (they silently became NaN columns) ---
        'tld_in_subdomain': int(bool(tld) and tld in subdomain_parts),
        'prefix_suffix': int('-' in host),
        'shortest_word_host': min((len(w) for w in host_words), default=0),
        'longest_words_raw': max((len(w) for w in url_words), default=0),
        'longest_word_path': max((len(w) for w in path_words), default=0),
        # The remaining features need the fetched page content / WHOIS /
        # search-engine data, which this function does not retrieve.
        # Neutral placeholders keep the pipeline NaN-free.
        # TODO(review): replace with real values if the page is fetched.
        'nb_hyperlinks': 0,
        'ratio_intHyperlinks': 0.0,
        'empty_title': 0,
        'domain_in_title': 0,
        'domain_age': -1,
        'google_index': 0,
        'page_rank': 0,
    }
    return url_features
# Classify a URL with the pre-fitted scaler + model pipeline.
def predict_phishing(url):
    """Return "Phishing" or "Legitimate" for the given URL string."""
    features = extract_features(url)
    # Column order must match what the scaler/model were fitted on.
    frame = pd.DataFrame([features], columns=selected_features)
    scaled = scaler.transform(frame)
    label = model.predict(scaled)[0]
    return "Phishing" if label == 1 else "Legitimate"
# Create the Gradio interface: a single URL textbox in, the predicted
# label ("Phishing" / "Legitimate") out.
demo = gr.Interface(
    fn=predict_phishing,
    inputs=gr.Textbox(label="URL"),
    outputs=gr.Textbox(label="Prediction"),
    title="Phishing URL Detection",
    description="Enter a URL and the model will predict if it's legitimate or phishing."
)

# Launch the local web server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()