"""Phishing URL detector.

Combines three reputation services (VirusTotal, PhishTank, Google Safe
Browsing) with a pre-trained AdaBoost classifier over lexical URL
features, and serves the result through a Gradio web UI.

Every check is deliberately fail-open: any network/parsing error yields
"Legitimate" so an outage never blocks the user.  Failures are logged
instead of silently swallowed.
"""

import base64
import json
import logging
import os
import re
from urllib.parse import urlparse

import gradio as gr
import joblib
import numpy as np
import pandas as pd
import requests

logger = logging.getLogger(__name__)

# --------------------------------------------------------------------------
# Model artefacts (produced by the offline training pipeline).
# --------------------------------------------------------------------------
feature_names = joblib.load('feature_names.joblib')
non_numeric_columns = joblib.load('non_numeric_columns.joblib')
scaler = joblib.load('scaler.joblib')

label_encoders = {}
for col in non_numeric_columns:
    try:
        label_encoders[col] = joblib.load(f'le_{col}.joblib')
    except (FileNotFoundError, OSError):
        # Encoder was not exported for this column; the column is simply
        # skipped at predict time (see predict_with_model).
        logger.warning("Label encoder for column %r not found; skipping", col)

# Load AdaBoost model
model = joblib.load('AdaBoost_best_model.joblib')

# --------------------------------------------------------------------------
# API credentials.
# SECURITY: these keys were committed to source control and should be
# rotated.  Environment variables take precedence; the literals remain only
# as a backward-compatible fallback.
# --------------------------------------------------------------------------
GOOGLE_API_KEY = os.environ.get(
    "GOOGLE_API_KEY", "AIzaSyCmXigpl6t7MZyvq8Rv37XW_lZVCi0oMSM")
PHISHTANK_USER_AGENT = "phishtank/PHISH"
VIRUSTOTAL_API_KEY = os.environ.get(
    "VIRUSTOTAL_API_KEY",
    "7c38bbc8b0f461b25b38d1cda16404fc8c3997c6b569a257707505759da7a996")
VIRUSTOTAL_BASE_URL = "https://www.virustotal.com/api/v3/"

vt_headers = {
    "x-apikey": VIRUSTOTAL_API_KEY,
    "Accept": "application/json",
}


def check_url_virustotal(target_url):
    """Query the VirusTotal v3 URL report for *target_url*.

    Returns "Phishing" when at least 5 vendors label the URL with a
    phishing/malicious verdict, otherwise "Legitimate".  Any failure
    (network, HTTP error, unexpected payload) is logged and treated as
    "Legitimate" (fail-open).
    """
    try:
        # VT identifies a URL by its unpadded urlsafe-base64 encoding.
        url_id = base64.urlsafe_b64encode(target_url.encode()).decode().strip("=")
        response = requests.get(
            f"{VIRUSTOTAL_BASE_URL}urls/{url_id}", headers=vt_headers, timeout=10)
        if response.status_code != 200:
            # 404 = unknown URL, 429 = quota; either way we cannot condemn it.
            return "Legitimate"
        results = response.json()['data']['attributes']['last_analysis_results']
        phishing_keywords = ("phishing", "malicious")
        malicious_count = sum(
            1 for vendor_result in results.values()
            if vendor_result['result']
            and any(kw in vendor_result['result'].lower() for kw in phishing_keywords)
        )
        return "Phishing" if malicious_count >= 5 else "Legitimate"
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        logger.warning("VirusTotal check failed for %s: %s", target_url, e)
        return "Legitimate"


def check_url_phishtank(url):
    """Ask PhishTank whether *url* is a verified phish.

    Returns "Phishing" only when the URL is both in the database and
    verified; "Legitimate" otherwise or on any error (fail-open).
    """
    endpoint = "https://checkurl.phishtank.com/checkurl/"
    payload = {"url": url, "format": "json"}
    headers = {"User-Agent": PHISHTANK_USER_AGENT}
    try:
        response = requests.post(endpoint, data=payload, headers=headers, timeout=5)
        response.raise_for_status()
        results = response.json().get('results', {})
        if results.get('in_database', False) and results.get('verified', False):
            return "Phishing"
        return "Legitimate"
    except (requests.RequestException, ValueError) as e:
        logger.warning("PhishTank check failed for %s: %s", url, e)
        return "Legitimate"


def check_url_safebrowsing(url):
    """Query Google Safe Browsing v4 threatMatches.find for *url*.

    Returns "Phishing" when any threat match is reported (malware, social
    engineering, or unwanted software), otherwise "Legitimate"; errors are
    logged and treated as "Legitimate" (fail-open).
    """
    endpoint = (
        "https://safebrowsing.googleapis.com/v4/threatMatches:find"
        f"?key={GOOGLE_API_KEY}"
    )
    payload = {
        "client": {"clientId": "PhishShield", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING", "UNWANTED_SOFTWARE"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}],
        },
    }
    try:
        response = requests.post(endpoint, json=payload, timeout=5)
        response.raise_for_status()
        return "Phishing" if "matches" in response.json() else "Legitimate"
    except (requests.RequestException, ValueError) as e:
        logger.warning("Safe Browsing check failed for %s: %s", url, e)
        return "Legitimate"


def extract_features(url):
    """Compute the lexical feature dict the model was trained on.

    NOTE: several features are misleadingly named (e.g. "ip" only tests
    whether the hostname contains a digit, "https_token" matches the
    substring "https" anywhere in the URL, and "prefix_suffix" can never
    fire after a scheme is prepended).  They are kept exactly as defined
    here because the serialized model was trained on these definitions —
    do not "fix" them without retraining.
    """
    parsed = urlparse(url)
    hostname = parsed.hostname if parsed.hostname else ''
    path = parsed.path if parsed.path else ''

    # Shannon entropy of the URL's character distribution (bits/char).
    entropy = 0.0
    if len(url) > 0:
        for c in set(url):
            p = url.count(c) / len(url)
            entropy -= p * np.log2(p) if p > 0 else 0

    features = {
        "length_url": len(url),
        "length_hostname": len(hostname),
        # NOTE: not a real IP test — true when the hostname has any digit.
        "ip": 1 if any(char.isdigit() for char in hostname) else 0,
        "nb_dots": url.count('.'),
        "nb_hyphens": url.count('-'),
        "nb_at": url.count('@'),
        "nb_qm": url.count('?'),
        "nb_and": url.count('&'),
        "nb_or": url.count('|'),
        "nb_eq": url.count('='),
        "nb_underscore": url.count('_'),
        "nb_tilde": url.count('~'),
        "nb_percent": url.count('%'),
        "nb_slash": url.count('/'),
        "nb_star": url.count('*'),
        "nb_colon": url.count(':'),
        "nb_comma": url.count(','),
        "nb_semicolumn": url.count(';'),
        "nb_dollar": url.count('$'),
        "nb_space": url.count(' '),
        "nb_www": 1 if "www" in url else 0,
        "nb_com": 1 if ".com" in url else 0,
        # Includes the scheme's "//" — kept to match training.
        "nb_dslash": url.count('//'),
        "http_in_path": 1 if "http" in path else 0,
        # Matches the substring "https" anywhere, including the scheme.
        "https_token": 1 if "https" in url else 0,
        "ratio_digits_url": sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
        "ratio_digits_host": sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0,
        "punycode": 1 if re.search(r'xn--', url, re.IGNORECASE) else 0,
        "port": parsed.port if parsed.port else 0,
        "tld_in_path": 1 if any(tld in path for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
        "tld_in_subdomain": 1 if any(tld in hostname for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
        "abnormal_subdomain": 1 if len(hostname.split('.')) > 3 else 0,
        "nb_subdomains": len(hostname.split('.')) - 1,
        # Can never be 1 once predict_url prepends "https://" — kept as trained.
        "prefix_suffix": 1 if url.startswith("www") else 0,
        "shortening_service": 1 if any(short in url for short in ['bit.ly', 'goo.gl', 'tinyurl.com']) else 0,
        "path_extension": 1 if any(ext in path for ext in ['.exe', '.zip', '.rar', '.tar', '.pdf']) else 0,
        "length_words_raw": len(url.split()),
        # NOTE: number of distinct characters, not a repetition count.
        "char_repeat": len(set(url)),
        "shortest_words_raw": min(len(word) for word in url.split()) if url.split() else 0,
        "longest_words_raw": max(len(word) for word in url.split()) if url.split() else 0,
        "shortest_word_host": min(len(word) for word in hostname.split('.')) if hostname else 0,
        "longest_word_host": max(len(word) for word in hostname.split('.')) if hostname else 0,
        # A leading "/" yields an empty first segment, so this is often 0.
        "shortest_word_path": min(len(word) for word in path.split('/')) if path else 0,
        "longest_word_path": max(len(word) for word in path.split('/')) if path else 0,
        "avg_words_raw": np.mean([len(word) for word in url.split()]) if url.split() else 0,
        "avg_word_host": np.mean([len(word) for word in hostname.split('.')]) if hostname else 0,
        "avg_word_path": np.mean([len(word) for word in path.split('/')]) if path else 0,
        "phish_hints": 1 if any(kw in url.lower() for kw in ['login', 'secure', 'verify', 'account']) else 0,
        # Brand checks are hard-coded to 'apple' — kept as trained.
        "domain_in_brand": 1 if 'apple' in hostname.lower() else 0,
        "brand_in_subdomain": 1 if 'apple' in (hostname.split('.')[0] if hostname else '') else 0,
        "brand_in_path": 1 if 'apple' in path.lower() else 0,
        "suspicious_tld": 1 if hostname.endswith(('.xyz', '.top', '.club', '.gq', '.cf', '.tk')) else 0,
        "entropy": entropy,
    }
    return features


def predict_with_model(url):
    """Classify *url* with the AdaBoost model.

    Returns "Phishing" only when the predicted phishing probability is at
    least 0.95 (a deliberately high bar, since the model is the last
    resort after three reputation services).  Any failure is logged and
    reported as "Legitimate" (fail-open).
    """
    try:
        features = extract_features(url)
        input_data = pd.DataFrame([features], columns=feature_names)
        for col in non_numeric_columns:
            if col in input_data.columns and col in label_encoders:
                try:
                    input_data[col] = label_encoders[col].transform(
                        input_data[col].astype(str))
                except ValueError:
                    # Unseen category: map to one past the known classes.
                    input_data[col] = len(label_encoders[col].classes_)
        input_scaled = scaler.transform(input_data)
        # AdaBoost model prediction (probability of class 1 = phishing).
        phish_prob = model.predict_proba(input_scaled)[0][1]
        return "Phishing" if phish_prob >= 0.95 else "Legitimate"
    except Exception as e:
        logger.exception("Model prediction failed for %s: %s", url, e)
        return "Legitimate"


def predict_url(url):
    """Top-level verdict for *url*: reputation services first, model last.

    A "Phishing" verdict from any service wins immediately.  If all three
    services answer "Legitimate" the model is skipped entirely; the model
    only runs in the (currently unreachable, defensive) case where a
    service returned something other than the two known labels.
    """
    try:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Check VirusTotal
        vt_result = check_url_virustotal(url)
        if vt_result == "Phishing":
            return "Phishing"

        # Check PhishTank
        pt_result = check_url_phishtank(url)
        if pt_result == "Phishing":
            return "Phishing"

        # Check Google Safe Browsing
        gsb_result = check_url_safebrowsing(url)
        if gsb_result == "Phishing":
            return "Phishing"

        # If all three services return "Legitimate", skip the model.
        if (vt_result == "Legitimate" and pt_result == "Legitimate"
                and gsb_result == "Legitimate"):
            return "Legitimate"

        return predict_with_model(url)
    except Exception as e:
        logger.exception("predict_url failed for %s: %s", url, e)
        return "Legitimate"


# Gradio Interface
interface = gr.Interface(
    fn=predict_url,
    inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com"),
    outputs=gr.Textbox(label="Result"),
    title="Phishing URL Detector",
    description="Check if a URL is phishing or legitimate",
    examples=[
        ["https://www.apple.com"],
        ["https://login-facebook-secure.xyz/login.php"],
        ["https://bit.ly/suspicious-download"],
    ],
)

if __name__ == "__main__":
    interface.launch(server_name="0.0.0.0", server_port=7860)