Hugging Face Space (app source below); the Space's build status was reported as "Build error".
import pickle
import re
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
# Load the fitted classifier and feature scaler produced at training time.
# NOTE(review): unpickling runs arbitrary code — only ship trusted .pkl files.
with open('phishing_model.pkl', 'rb') as fh:
    model = pickle.load(fh)
with open('scaler.pkl', 'rb') as fh:
    scaler = pickle.load(fh)
# The 23 feature columns (in training order) fed to the scaler/model.
selected_features = [
    'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_qm', 'nb_eq',
    'nb_slash', 'nb_www', 'ratio_digits_url', 'ratio_digits_host',
    'tld_in_subdomain', 'prefix_suffix', 'shortest_word_host',
    'longest_words_raw', 'longest_word_path', 'phish_hints',
    'nb_hyperlinks', 'ratio_intHyperlinks', 'empty_title',
    'domain_in_title', 'domain_age', 'google_index', 'page_rank',
]
def extract_features(url):
    """Extract lexical/heuristic features from *url* for the phishing model.

    Parameters
    ----------
    url : str
        Raw URL as typed by the user (with or without a scheme).

    Returns
    -------
    dict
        Feature name -> numeric value. Contains every name listed in
        ``selected_features`` — the original version omitted 12 of them
        (``tld_in_subdomain`` … ``page_rank``), so the DataFrame built in
        ``predict_phishing`` was full of NaN columns. Features that need
        page content, WHOIS, or search-engine data (``nb_hyperlinks``,
        ``domain_age``, ``google_index``, ``page_rank``, ...) cannot be
        derived from the URL string alone and default to 0 here.
        NOTE(review): confirm 0 matches the training-set encoding for
        "unknown" on those content-based features.
    """
    # Parse once instead of re-parsing on every feature line.
    parsed = urlparse(url)
    hostname = parsed.netloc
    path = parsed.path
    host_parts = hostname.split('.')

    # Alphanumeric "words" used by the word-length features.
    url_words = [w for w in re.split(r'[^A-Za-z0-9]+', url) if w]
    host_words = [w for w in re.split(r'[^A-Za-z0-9]+', hostname) if w]
    path_words = [w for w in re.split(r'[^A-Za-z0-9]+', path) if w]

    # Subdomain labels: everything left of the registered domain + TLD
    # (heuristic — ignores multi-label public suffixes like "co.uk").
    subdomain_parts = host_parts[:-2] if len(host_parts) > 2 else []

    brands = ['google', 'paypal', 'facebook', 'ebay', 'amazon']

    url_features = {
        'length_url': len(url),
        'length_hostname': len(hostname),
        'ip': int(host_parts[0].isdigit()),
        'nb_dots': url.count('.'),
        'nb_hyphens': url.count('-'),
        'nb_at': url.count('@'),
        'nb_qm': url.count('?'),
        'nb_and': url.count('&'),
        'nb_or': url.count('|'),
        'nb_eq': url.count('='),
        'nb_underscore': url.count('_'),
        'nb_tilde': url.count('~'),
        'nb_percent': url.count('%'),
        'nb_slash': url.count('/'),
        'nb_star': url.count('*'),
        'nb_colon': url.count(':'),
        'nb_comma': url.count(','),
        'nb_semicolumn': url.count(';'),
        'nb_dollar': url.count('$'),
        'nb_space': url.count(' '),
        'nb_www': int('www.' in url),
        'nb_com': int('.com' in url),
        'nb_dslash': int('//' in url),
        'http_in_path': int('http' in url),
        'https_token': int('https' in url),
        # Guard the divisions: the original raised ZeroDivisionError on ""
        # and on scheme-less input (urlparse("example.com").netloc == "").
        'ratio_digits_url': (sum(c.isdigit() for c in url) / len(url)
                             if url else 0.0),
        'ratio_digits_host': (sum(c.isdigit() for c in hostname) / len(hostname)
                              if hostname else 0.0),
        'punycode': int(url.startswith('xn--')),
        'shortening_service': int(any(s in url for s in
                                      ['bit.ly', 'goo.gl', 't.co', 'ow.ly', 'y2u.be'])),
        'path_extension': int(any(url.endswith(ext) for ext in
                                  ['.php', '.asp', '.jsp', '.do'])),
        'phish_hints': int(any(h in url for h in
                               ['login', 'account', 'verify', 'confirm'])),
        'domain_in_brand': int(any(b in url for b in brands)),
        'brand_in_subdomain': int(any(b in host_parts for b in brands)),
        'brand_in_path': int(any(b in url.split('/') for b in brands)),
        'suspecious_tld': int(host_parts[-1] in ['info', 'xyz', 'top', 'club']),
        # --- features required by selected_features but missing before ----
        'tld_in_subdomain': int(bool(subdomain_parts)
                                and host_parts[-1] in subdomain_parts),
        'prefix_suffix': int('-' in hostname),
        'shortest_word_host': min((len(w) for w in host_words), default=0),
        'longest_words_raw': max((len(w) for w in url_words), default=0),
        'longest_word_path': max((len(w) for w in path_words), default=0),
        # Content/WHOIS/search-engine features: not derivable offline.
        'nb_hyperlinks': 0,
        'ratio_intHyperlinks': 0.0,
        'empty_title': 0,
        'domain_in_title': 0,
        'domain_age': 0,
        'google_index': 0,
        'page_rank': 0,
    }
    return url_features
# Function to make a prediction
def predict_phishing(url):
    """Classify *url* with the loaded model; returns "Phishing" or "Legitimate"."""
    features = extract_features(url)
    # Order the columns exactly as the scaler/model expect them.
    frame = pd.DataFrame([features], columns=selected_features)
    scaled = scaler.transform(frame)
    label = model.predict(scaled)[0]
    return "Phishing" if label == 1 else "Legitimate"
# Gradio UI: one text box in, one text box out.
demo = gr.Interface(
    fn=predict_phishing,
    title="Phishing URL Detection",
    description="Enter a URL and the model will predict if it's legitimate or phishing.",
    inputs=gr.Textbox(label="URL"),
    outputs=gr.Textbox(label="Prediction"),
)

if __name__ == "__main__":
    demo.launch()