"""
URL Phishing Analyzer
Rule-based phishing detection with optional PhishTank API integration.
Computes a risk score (0-100) and confidence (0-1) with detailed reasons.
"""

import re
import asyncio
from datetime import datetime
from typing import Dict, List, Optional, Any
import logging

logger = logging.getLogger(__name__)

# Known suspicious TLDs frequently used in phishing
SUSPICIOUS_TLDS = {
    '.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.club',
    '.work', '.date', '.racing', '.download', '.win', '.bid',
    '.stream', '.trade', '.webcam', '.loan', '.party', '.click',
    '.link', '.info', '.zip', '.mov', '.php'
}

# URL shortener domains
URL_SHORTENERS = {
    'bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'is.gd',
    'buff.ly', 'j.mp', 'rb.gy', 'shorturl.at', 'tiny.cc', 'cutt.ly',
    's.id', 'v.gd', 'clck.ru', 'qr.ae'
}

# Suspicious keywords commonly found in phishing URLs
SUSPICIOUS_KEYWORDS = [
    'login', 'verify', 'account', 'secure', 'update', 'banking',
    'signin', 'confirm', 'password', 'credential', 'authenticate',
    'wallet', 'suspend', 'restrict', 'unlock', 'alert', 'notification',
    'paypal', 'appleid', 'microsoft', 'amazon', 'netflix', 'facebook',
    'instagram', 'wellsfargo', 'chase', 'citi'
]

# Well-known legitimate domains (whitelist)
TRUSTED_DOMAINS = {
    'google.com', 'www.google.com', 'youtube.com', 'www.youtube.com',
    'facebook.com', 'www.facebook.com', 'twitter.com', 'x.com',
    'github.com', 'www.github.com', 'stackoverflow.com',
    'microsoft.com', 'www.microsoft.com', 'apple.com', 'www.apple.com',
    'amazon.com', 'www.amazon.com', 'wikipedia.org', 'en.wikipedia.org',
    'linkedin.com', 'www.linkedin.com', 'reddit.com', 'www.reddit.com',
    'instagram.com', 'www.instagram.com', 'netflix.com', 'www.netflix.com',
    'whatsapp.com', 'web.whatsapp.com', 'mail.google.com',
    'outlook.com', 'outlook.live.com', 'drive.google.com',
    'docs.google.com', 'localhost'
}

# Homoglyph characters (look-alikes used to spoof domains)
HOMOGLYPH_PATTERN = re.compile(r'[а-яА-ЯёЁ\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]')


def analyze_url(url_string: str) -> Dict[str, Any]:
    """
    Analyze a URL for phishing indicators

    Args:
        url_string: The URL to analyze

    Returns:
        Dictionary with safe, riskScore, confidence, reasons, category, timestamp
    """
    reasons: List[str] = []
    risk_score = 0

    # Basic validation
    if not url_string or not isinstance(url_string, str):
        return {
            "url": url_string,
            "safe": False,
            "riskScore": 100,
            "confidence": 1.0,
            "reasons": ["Invalid or empty URL"],
            "category": "invalid",
            "timestamp": datetime.utcnow().isoformat()
        }

    # Parse URL
    try:
        # Handle URLs without protocol
        url_with_protocol = url_string if url_string.startswith('http') else f'http://{url_string}'
        from urllib.parse import urlparse
        parsed = urlparse(url_with_protocol)
    except Exception:
        return {
            "url": url_string,
            "safe": False,
            "riskScore": 80,
            "confidence": 0.9,
            "reasons": ["Malformed URL that cannot be parsed"],
            "category": "malformed",
            "timestamp": datetime.utcnow().isoformat()
        }

    hostname = parsed.hostname.lower() if parsed.hostname else ""
    full_url = url_string.lower()
    path = parsed.path.lower()

    # Whitelist check
    if hostname in TRUSTED_DOMAINS:
        return {
            "url": url_string,
            "safe": True,
            "riskScore": 0,
            "confidence": 0.95,
            "reasons": [],
            "category": "trusted",
            "timestamp": datetime.utcnow().isoformat()
        }

    # Check 1: IP address instead of domain
    ipv4_regex = re.compile(r'^(\d{1,3}\.){3}\d{1,3}$')
    if ipv4_regex.match(hostname):
        risk_score += 30
        reasons.append("URL uses an IP address instead of a domain name — common in phishing")

    # Check 2: Suspicious TLD
    tld = '.' + (hostname.split('.')[-1] if hostname else "")
    if tld in SUSPICIOUS_TLDS:
        risk_score += 20
        reasons.append(f'Uses suspicious top-level domain "{tld}" — frequently abused for phishing')

    # Check 3: Excessive subdomains
    subdomain_count = len(hostname.split('.')) - 2 if hostname else 0
    if subdomain_count > 2:
        risk_score += 15
        reasons.append(f"Excessive subdomains ({subdomain_count + 2} levels) — used to disguise real domain")

    # Check 4: URL shortener
    if hostname in URL_SHORTENERS:
        risk_score += 25
        reasons.append("URL shortener detected — hides the actual destination, often used in phishing")

    # Check 5: Homoglyph / IDN characters
    if HOMOGLYPH_PATTERN.search(url_string):
        risk_score += 35
        reasons.append("Contains homoglyph/Cyrillic characters — used to impersonate legitimate domains")

    # Check 6: Very long URL
    if len(url_string) > 100:
        risk_score += 10
        reasons.append(f"Unusually long URL ({len(url_string)} characters) — may be hiding malicious content")

    # Check 7: Suspicious keywords in URL
    found_keywords = [kw for kw in SUSPICIOUS_KEYWORDS if kw in full_url]
    if found_keywords:
        keyword_score = min(len(found_keywords) * 8, 25)
        risk_score += keyword_score
        reasons.append(f"Contains suspicious keywords: {', '.join(found_keywords)} — common in phishing URLs")

    # Check 8: @ symbol in URL
    if '@' in url_string:
        risk_score += 25
        reasons.append('Contains "@" symbol — can redirect to a different domain than displayed')

    # Check 9: Data URI
    if url_string.lower().startswith('data:'):
        risk_score += 40
        reasons.append("Data URI detected — can embed malicious content without a server")

    # Check 10: HTTPS check
    if parsed.scheme == 'http' and 'localhost' not in hostname:
        risk_score += 40
        reasons.append("Uses HTTP instead of HTTPS — connection is not encrypted and insecure")

    # Check 11: Port number in URL
    if parsed.port and parsed.port not in [80, 443]:
        risk_score += 10
        reasons.append(f"Non-standard port (:{parsed.port}) — unusual for legitimate websites")

    # Check 12: Double extension in path
    double_ext_regex = re.compile(r'\.\w{2,4}\.\w{2,4}$')
    if double_ext_regex.search(path):
        risk_score += 20
        reasons.append("Double file extension detected in path — common trick to disguise malware")

    # Check 13: Encoded characters abuse
    encoded_count = len(re.findall(r'%[0-9a-fA-F]{2}', url_string))
    if encoded_count > 5:
        risk_score += 15
        reasons.append(f"Heavy URL encoding ({encoded_count} encoded chars) — may be obfuscating content")

    # Check 14: Hyphen abuse in domain
    hyphens = hostname.count('-') if hostname else 0
    if hyphens > 3:
        risk_score += 15
        reasons.append(f"Excessive hyphens in domain ({hyphens}) — common in phishing domains")

    # Cap score at 100
    risk_score = min(risk_score, 100)

    # Compute confidence based on number of signals
    confidence = min(0.3 + len(reasons) * 0.12, 1.0)

    # Determine category
    category = "safe"
    if risk_score >= 70:
        category = "high_risk"
    elif risk_score >= 40:
        category = "medium_risk"
    elif risk_score >= 20:
        category = "low_risk"

    return {
        "url": url_string,
        "safe": risk_score < 40,
        "riskScore": risk_score,
        "confidence": round(confidence, 2),
        "reasons": reasons,
        "category": category,
        "timestamp": datetime.utcnow().isoformat()
    }


async def check_phishtank(url_string: str, api_key: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """
    Check URL against PhishTank database

    Args:
        url_string: URL to check
        api_key: PhishTank API key (optional)

    Returns:
        PhishTank result or None
    """
    try:
        import aiohttp

        params = {
            'url': url_string,
            'format': 'json'
        }
        if api_key:
            params['app_key'] = api_key

        async with aiohttp.ClientSession() as session:
            async with session.post(
                'http://checkurl.staging.phishtank.com/checkurl/',
                data=params,
                headers={'User-Agent': 'phishtank/wingineers'},
                timeout=aiohttp.ClientTimeout(total=5)
            ) as response:
                if response.status != 200:
                    return None

                data = await response.json()
                return {
                    "inDatabase": data.get('results', {}).get('in_database') in [True, 'true'],
                    "isPhish": data.get('results', {}).get('valid') in [True, 'y'],
                    "phishDetailUrl": data.get('results', {}).get('phish_detail_page')
                }
    except Exception as e:
        logger.warning(f"PhishTank lookup failed: {e}")
        return None


async def full_analysis(url_string: str, phish_tank_api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Full analysis: rule-based + PhishTank
    """
    result = analyze_url(url_string)

    # Run PhishTank check if API key is available
    phish_tank_result = await check_phishtank(url_string, phish_tank_api_key)

    if phish_tank_result:
        result["phishTank"] = phish_tank_result

        if phish_tank_result["isPhish"]:
            result["safe"] = False
            result["riskScore"] = max(result["riskScore"], 90)
            result["confidence"] = max(result["confidence"], 0.95)
            result["reasons"].append("⚠️ Confirmed phishing URL in PhishTank database")
            result["category"] = "high_risk"
        elif phish_tank_result["inDatabase"] and not phish_tank_result["isPhish"]:
            # In database but not confirmed as phish — lower risk slightly
            if result["riskScore"] > 20:
                result["riskScore"] = max(result["riskScore"] - 10, 0)

    return result