import pickle
import re
from urllib.parse import urlparse

import gradio as gr
import pandas as pd

# --- Model artifacts -------------------------------------------------------
# NOTE(review): pickle.load is only safe because these files are produced by
# our own training pipeline — never point these paths at untrusted data.
with open('phishing_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Feature columns the scaler/model were fitted on, in training order.
selected_features = ['length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_qm',
                     'nb_eq', 'nb_slash', 'nb_www', 'ratio_digits_url',
                     'ratio_digits_host', 'tld_in_subdomain', 'prefix_suffix',
                     'shortest_word_host', 'longest_words_raw',
                     'longest_word_path', 'phish_hints', 'nb_hyperlinks',
                     'ratio_intHyperlinks', 'empty_title', 'domain_in_title',
                     'domain_age', 'google_index', 'page_rank']

# Content/reputation features that cannot be derived from the URL string alone
# (they need a page fetch or a third-party API).  The original code never
# produced them, which left NaN columns in the DataFrame and broke
# scaler.transform.  Defaulting to 0 keeps the pipeline runnable.
# TODO(review): confirm 0 matches how these were encoded at training time.
_CONTENT_FEATURE_DEFAULTS = {
    'nb_hyperlinks': 0,
    'ratio_intHyperlinks': 0,
    'empty_title': 0,
    'domain_in_title': 0,
    'domain_age': 0,
    'google_index': 0,
    'page_rank': 0,
}

_BRANDS = ['google', 'paypal', 'facebook', 'ebay', 'amazon']
# TLD tokens checked for inside the subdomain (a classic phishing trick,
# e.g. "paypal.com.evil.net").
_COMMON_TLDS = {'com', 'net', 'org', 'info', 'edu', 'gov', 'biz'}
_WORD_SPLIT = re.compile(r'[^A-Za-z0-9]+')


def extract_features(url):
    """Derive lexical features from *url* for the phishing classifier.

    Parameters
    ----------
    url : str
        Raw URL exactly as typed by the user.

    Returns
    -------
    dict
        Maps every name in ``selected_features`` (plus additional lexical
        features) to a numeric value.  Features that would require fetching
        the page are filled from ``_CONTENT_FEATURE_DEFAULTS``.
    """
    parsed = urlparse(url)  # hoisted: the original re-parsed the URL 6 times
    host = parsed.netloc
    host_parts = host.split('.') if host else []
    url_words = [w for w in _WORD_SPLIT.split(url) if w]
    path_words = [w for w in _WORD_SPLIT.split(parsed.path) if w]

    features = {
        'length_url': len(url),
        'length_hostname': len(host),
        # Crude IP heuristic: first host label is all digits.
        'ip': int(bool(host_parts) and host_parts[0].isdigit()),
        'nb_dots': url.count('.'),
        'nb_hyphens': url.count('-'),
        'nb_at': url.count('@'),
        'nb_qm': url.count('?'),
        'nb_and': url.count('&'),
        'nb_or': url.count('|'),
        'nb_eq': url.count('='),
        'nb_underscore': url.count('_'),
        'nb_tilde': url.count('~'),
        'nb_percent': url.count('%'),
        'nb_slash': url.count('/'),
        'nb_star': url.count('*'),
        'nb_colon': url.count(':'),
        'nb_comma': url.count(','),
        'nb_semicolumn': url.count(';'),
        'nb_dollar': url.count('$'),
        'nb_space': url.count(' '),
        'nb_www': int('www.' in url),
        'nb_com': int('.com' in url),
        'nb_dslash': int('//' in url),
        'http_in_path': int('http' in url),
        'https_token': int('https' in url),
        # max(..., 1) guards the ZeroDivisionError the original hit on empty
        # input and on scheme-less URLs (netloc == '').
        'ratio_digits_url': sum(c.isdigit() for c in url) / max(len(url), 1),
        'ratio_digits_host': sum(c.isdigit() for c in host) / max(len(host), 1),
        'punycode': int(url.startswith('xn--')),
        'shortening_service': int(any(s in url for s in
                                      ['bit.ly', 'goo.gl', 't.co',
                                       'ow.ly', 'y2u.be'])),
        'path_extension': int(url.endswith(('.php', '.asp', '.jsp', '.do'))),
        'phish_hints': int(any(h in url for h in
                               ['login', 'account', 'verify', 'confirm'])),
        'domain_in_brand': int(any(b in url for b in _BRANDS)),
        'brand_in_subdomain': int(any(b in host_parts for b in _BRANDS)),
        'brand_in_path': int(any(b in url.split('/') for b in _BRANDS)),
        'suspecious_tld': int(bool(host_parts) and
                              host_parts[-1] in ['info', 'xyz', 'top', 'club']),
        # --- features the model requires but the original never computed ---
        'tld_in_subdomain': int(any(p in _COMMON_TLDS
                                    for p in host_parts[:-2])),
        'prefix_suffix': int('-' in host),
        'shortest_word_host': min((len(p) for p in host_parts), default=0),
        'longest_words_raw': max((len(w) for w in url_words), default=0),
        'longest_word_path': max((len(w) for w in path_words), default=0),
    }
    features.update(_CONTENT_FEATURE_DEFAULTS)
    return features


def predict_phishing(url):
    """Return ``"Phishing"`` or ``"Legitimate"`` for *url* using the model."""
    url_features = extract_features(url)
    # Select/order columns exactly as the scaler was fitted; every key now
    # exists, so no NaN columns reach scaler.transform.
    url_data = pd.DataFrame([url_features])[selected_features]
    url_data_scaled = scaler.transform(url_data)
    prediction = model.predict(url_data_scaled)
    return "Phishing" if prediction[0] == 1 else "Legitimate"


# --- Gradio UI -------------------------------------------------------------
demo = gr.Interface(
    fn=predict_phishing,
    inputs=gr.Textbox(label="URL"),
    outputs=gr.Textbox(label="Prediction"),
    title="Phishing URL Detection",
    description="Enter a URL and the model will predict if it's legitimate or phishing."
)

if __name__ == "__main__":
    demo.launch()