import pickle
import re
from urllib.parse import urlparse

import gradio as gr
import pandas as pd

# --- Model artifacts -------------------------------------------------------
# NOTE(review): pickle.load is only safe because these files are produced by
# our own training pipeline — never point these paths at untrusted data.
with open('phishing_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Feature columns the scaler/model were fitted on, in training order.
selected_features = ['length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_qm',
                     'nb_eq', 'nb_slash', 'nb_www', 'ratio_digits_url',
                     'ratio_digits_host', 'tld_in_subdomain', 'prefix_suffix',
                     'shortest_word_host', 'longest_words_raw',
                     'longest_word_path', 'phish_hints', 'nb_hyperlinks',
                     'ratio_intHyperlinks', 'empty_title', 'domain_in_title',
                     'domain_age', 'google_index', 'page_rank']

# Content/reputation features that cannot be derived from the URL string alone
# (they need a page fetch or a third-party API).  The original code never
# produced them, which left NaN columns in the DataFrame and broke
# scaler.transform.  Defaulting to 0 keeps the pipeline runnable.
# TODO(review): confirm 0 matches how these were encoded at training time.
_CONTENT_FEATURE_DEFAULTS = {
    'nb_hyperlinks': 0,
    'ratio_intHyperlinks': 0,
    'empty_title': 0,
    'domain_in_title': 0,
    'domain_age': 0,
    'google_index': 0,
    'page_rank': 0,
}

_BRANDS = ['google', 'paypal', 'facebook', 'ebay', 'amazon']
# TLD tokens checked for inside the subdomain (a classic phishing trick,
# e.g. "paypal.com.evil.net").
_COMMON_TLDS = {'com', 'net', 'org', 'info', 'edu', 'gov', 'biz'}
_WORD_SPLIT = re.compile(r'[^A-Za-z0-9]+')


def extract_features(url):
    """Derive lexical features from *url* for the phishing classifier.

    Parameters
    ----------
    url : str
        Raw URL exactly as typed by the user.

    Returns
    -------
    dict
        Maps every name in ``selected_features`` (plus additional lexical
        features) to a numeric value.  Features that would require fetching
        the page are filled from ``_CONTENT_FEATURE_DEFAULTS``.
    """
    parsed = urlparse(url)  # hoisted: the original re-parsed the URL 6 times
    host = parsed.netloc
    host_parts = host.split('.') if host else []
    url_words = [w for w in _WORD_SPLIT.split(url) if w]
    path_words = [w for w in _WORD_SPLIT.split(parsed.path) if w]

    features = {
        'length_url': len(url),
        'length_hostname': len(host),
        # Crude IP heuristic: first host label is all digits.
        'ip': int(bool(host_parts) and host_parts[0].isdigit()),
        'nb_dots': url.count('.'),
        'nb_hyphens': url.count('-'),
        'nb_at': url.count('@'),
        'nb_qm': url.count('?'),
        'nb_and': url.count('&'),
        'nb_or': url.count('|'),
        'nb_eq': url.count('='),
        'nb_underscore': url.count('_'),
        'nb_tilde': url.count('~'),
        'nb_percent': url.count('%'),
        'nb_slash': url.count('/'),
        'nb_star': url.count('*'),
        'nb_colon': url.count(':'),
        'nb_comma': url.count(','),
        'nb_semicolumn': url.count(';'),
        'nb_dollar': url.count('$'),
        'nb_space': url.count(' '),
        'nb_www': int('www.' in url),
        'nb_com': int('.com' in url),
        'nb_dslash': int('//' in url),
        'http_in_path': int('http' in url),
        'https_token': int('https' in url),
        # max(..., 1) guards the ZeroDivisionError the original hit on empty
        # input and on scheme-less URLs (netloc == '').
        'ratio_digits_url': sum(c.isdigit() for c in url) / max(len(url), 1),
        'ratio_digits_host': sum(c.isdigit() for c in host) / max(len(host), 1),
        'punycode': int(url.startswith('xn--')),
        'shortening_service': int(any(s in url for s in
                                      ['bit.ly', 'goo.gl', 't.co',
                                       'ow.ly', 'y2u.be'])),
        'path_extension': int(url.endswith(('.php', '.asp', '.jsp', '.do'))),
        'phish_hints': int(any(h in url for h in
                               ['login', 'account', 'verify', 'confirm'])),
        'domain_in_brand': int(any(b in url for b in _BRANDS)),
        'brand_in_subdomain': int(any(b in host_parts for b in _BRANDS)),
        'brand_in_path': int(any(b in url.split('/') for b in _BRANDS)),
        'suspecious_tld': int(bool(host_parts) and
                              host_parts[-1] in ['info', 'xyz', 'top', 'club']),
        # --- features the model requires but the original never computed ---
        'tld_in_subdomain': int(any(p in _COMMON_TLDS
                                    for p in host_parts[:-2])),
        'prefix_suffix': int('-' in host),
        'shortest_word_host': min((len(p) for p in host_parts), default=0),
        'longest_words_raw': max((len(w) for w in url_words), default=0),
        'longest_word_path': max((len(w) for w in path_words), default=0),
    }
    features.update(_CONTENT_FEATURE_DEFAULTS)
    return features


def predict_phishing(url):
    """Return ``"Phishing"`` or ``"Legitimate"`` for *url* using the model."""
    url_features = extract_features(url)
    # Select/order columns exactly as the scaler was fitted; every key now
    # exists, so no NaN columns reach scaler.transform.
    url_data = pd.DataFrame([url_features])[selected_features]
    url_data_scaled = scaler.transform(url_data)
    prediction = model.predict(url_data_scaled)
    return "Phishing" if prediction[0] == 1 else "Legitimate"


# --- Gradio UI -------------------------------------------------------------
demo = gr.Interface(
    fn=predict_phishing,
    inputs=gr.Textbox(label="URL"),
    outputs=gr.Textbox(label="Prediction"),
    title="Phishing URL Detection",
    description="Enter a URL and the model will predict if it's legitimate or phishing."
)

if __name__ == "__main__":
    demo.launch()