"""
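Download HTML Content from Labeled URLs (Legitimate and Phishing)

This script downloads HTML content for the URLs listed in a labeled CSV dataset
and stores each page under a 'legitimate' or 'phishing' directory according to its label.
Saves size-optimized HTML files for later feature extraction.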
"""

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
import hashlib
import logging
from datetime import datetime
from bs4 import BeautifulSoup
import re
import urllib3
import random
from collections import defaultdict
from threading import Lock
import json

# Requests are made with certificate verification turned off (phishing hosts
# frequently serve invalid certificates), so silence the resulting warning.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

# Process-wide logging: time-of-day timestamp, level, message.
logging.basicConfig(
    datefmt='%H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("html_downloader")


class HTMLDownloader:
    """Optimized HTML downloader with retry, checkpointing, and rate limiting."""
    
    def __init__(self, output_dir='data/html', max_workers=20, timeout=8, checkpoint_interval=100):
        """
        Initialize optimized HTML downloader.
        
        Args:
            output_dir: Base directory to save HTML files
            max_workers: Number of parallel download threads (increased to 20)
            timeout: Request timeout in seconds (reduced to 8s for faster failure)
            checkpoint_interval: Save progress every N URLs
        """
        self.output_dir = Path(output_dir)
        self.legit_dir = self.output_dir / 'legitimate'
        self.phishing_dir = self.output_dir / 'phishing'
        self.legit_dir.mkdir(parents=True, exist_ok=True)
        self.phishing_dir.mkdir(parents=True, exist_ok=True)
        self.max_workers = max_workers
        self.timeout = timeout
        self.checkpoint_interval = checkpoint_interval
        
        # Stats
        self.stats = {
            'total': 0,
            'success': 0,
            'failed': 0,
            'timeout': 0,
            'error': 0,
            'retried': 0,
            'http_fallback': 0
        }
        
        # User agents rotation (avoid blocks)
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0',
        ]
        
        # Domain rate limiting (delay per domain)
        self.domain_last_access = defaultdict(float)
        self.domain_lock = Lock()
        self.min_domain_delay = 0.5  # 500ms between requests to same domain
        
        # Session pool for connection reuse
        self.sessions = []
        for _ in range(max_workers):
            session = self._create_session()
            self.sessions.append(session)
        
        # Checkpoint file
        self.checkpoint_file = self.output_dir / 'download_checkpoint.json'
        self.completed_urls = self._load_checkpoint()
    
    def _create_session(self):
        """
        Create a requests session with automatic retries and browser-like headers.

        Returns:
            A requests.Session whose http:// and https:// adapters retry GET/HEAD
            requests up to 3 times (with backoff) on 429/500/502/503/504.
        """
        session = requests.Session()

        # Retry strategy: 3 retries with exponential backoff
        retry_strategy = Retry(
            total=3,
            backoff_factor=0.5,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "HEAD"]
        )

        adapter = HTTPAdapter(
            max_retries=retry_strategy,
            pool_connections=100,
            pool_maxsize=100,
            pool_block=False
        )

        session.mount("http://", adapter)
        session.mount("https://", adapter)

        # Accept-Encoding is deliberately NOT overridden here. The session's
        # default value only advertises encodings this environment can decode;
        # forcing 'br' without a Brotli decoder installed lets servers answer
        # with a body that cannot be decompressed, which would then be saved
        # as garbage.
        session.headers.update({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
        })

        return session
    
    def _get_random_user_agent(self):
        """Get random user agent to avoid detection."""
        return random.choice(self.user_agents)
    
    def _load_checkpoint(self):
        """Load checkpoint of already downloaded URLs."""
        if self.checkpoint_file.exists():
            try:
                with open(self.checkpoint_file, 'r') as f:
                    data = json.load(f)
                    completed = set(data.get('completed_urls', []))
                    logger.info(f"Loaded checkpoint: {len(completed):,} URLs already downloaded")
                    return completed
            except Exception as e:
                logger.warning(f"Failed to load checkpoint: {e}")
        return set()
    
    def _save_checkpoint(self, results):
        """Save checkpoint of completed URLs."""
        try:
            completed = [r['url'] for r in results if r['status'] == 'success']
            self.completed_urls.update(completed)
            
            with open(self.checkpoint_file, 'w') as f:
                json.dump({
                    'completed_urls': list(self.completed_urls),
                    'timestamp': datetime.now().isoformat(),
                    'total_completed': len(self.completed_urls)
                }, f)
        except Exception as e:
            logger.warning(f"Failed to save checkpoint: {e}")
    
    def _rate_limit_domain(self, url):
        """Apply per-domain rate limiting."""
        try:
            from urllib.parse import urlparse
            domain = urlparse(url).netloc
            
            with self.domain_lock:
                last_access = self.domain_last_access[domain]
                now = time.time()
                time_since_last = now - last_access
                
                if time_since_last < self.min_domain_delay:
                    sleep_time = self.min_domain_delay - time_since_last
                    time.sleep(sleep_time)
                
                self.domain_last_access[domain] = time.time()
        except:
            pass  # If rate limiting fails, continue anyway
    
    def _url_to_filename(self, url):
        """Convert URL to safe filename using hash."""
        url_hash = hashlib.md5(url.encode()).hexdigest()
        return f"{url_hash}.html"
    
    def _optimize_html(self, html_content):
        """
        Aggressively optimize HTML for feature extraction.

        Removes unnecessary content while preserving structure:
        - HTML comments and excessive whitespace
        - Inline style attributes (style tags are kept for counting)
        - Script/style bodies longer than 500 characters are truncated (tags are kept)
        - Base64 embedded images are collapsed to the placeholder 'data:image'

        Special string nodes (doctype, CDATA, declarations, processing
        instructions) are left untouched by the whitespace pass, so the
        document's doctype survives in the output.

        Args:
            html_content: Raw HTML content

        Returns:
            Optimized HTML string. If parsing/optimization fails, the input
            with only comments and blank lines removed.
        """
        try:
            # Function-scope import keeps this block self-contained; bs4 is
            # already a dependency of this file.
            from bs4.element import PreformattedString

            # Quick regex cleanup before parsing
            # Remove HTML comments
            html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)

            # Remove base64 embedded images (can be huge, not needed for features)
            html_content = re.sub(r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+', 'data:image', html_content)

            # Prefer the lxml parser; fall back to the built-in one if lxml is
            # unavailable or rejects the markup
            try:
                soup = BeautifulSoup(html_content, 'lxml')
            except Exception:
                soup = BeautifulSoup(html_content, 'html.parser')

            # Remove inline styles (but keep style tags for counting)
            for tag in soup.find_all(style=True):
                del tag['style']

            # Truncate large script/style content (keep tags for counting, trim content)
            for script in soup.find_all('script'):
                if script.string and len(script.string) > 500:
                    script.string = script.string[:500] + '...'

            for style in soup.find_all('style'):
                if style.string and len(style.string) > 500:
                    style.string = style.string[:500] + '...'

            # Normalize whitespace in ordinary text nodes
            for text in soup.find_all(string=True):
                if isinstance(text, PreformattedString):
                    # Doctype/CDATA/declaration nodes: replacing them with a
                    # plain string would turn e.g. the doctype into visible text
                    continue
                if text.parent.name not in ['script', 'style']:  # type: ignore
                    normalized = re.sub(r'\s+', ' ', str(text).strip())
                    if normalized:
                        text.replace_with(normalized)

            # Convert back to string
            optimized = str(soup)

            # Final cleanup: remove excessive blank lines
            optimized = re.sub(r'\n\s*\n+', '\n', optimized)

            return optimized

        except Exception as e:
            logger.warning(f"HTML optimization failed: {e}, returning original")
            # Fallback: at least remove comments and excessive whitespace
            html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)
            html_content = re.sub(r'\n\s*\n+', '\n', html_content)
            return html_content
    
    def download_single_url(self, url, label, url_id=None, session=None):
        """
        Download HTML with retry logic and HTTP fallback.
        
        Args:
            url: URL to download
            label: Label (0=legitimate, 1=phishing)
            url_id: Optional ID from dataset
            session: Requests session (for connection pooling)
            
        Returns:
            Dictionary with download result
        """
        result = {
            'url': url,
            'label': label,
            'url_id': url_id,
            'status': 'failed',
            'error': None,
            'filename': None,
            'size': 0,
            'original_size': 0
        }
        
        # Skip if already downloaded
        if url in self.completed_urls:
            result['status'] = 'skipped'
            result['error'] = 'Already downloaded'
            return result
        
        # Apply rate limiting
        self._rate_limit_domain(url)
        
        # Use provided session or create temporary one
        if session is None:
            session = self._create_session()
        
        # Add scheme if missing (default HTTPS)
        original_url = url
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        attempts = [url]
        
        # If HTTPS, also try HTTP as fallback
        if url.startswith('https://'):
            http_url = url.replace('https://', 'http://', 1)
            attempts.append(http_url)
        
        # Try each URL variant
        for attempt_num, attempt_url in enumerate(attempts):
            try:
                # Random user agent for each attempt
                headers = {'User-Agent': self._get_random_user_agent()}
                
                # Download with timeout and retries (handled by session)
                response = session.get(
                    attempt_url,
                    headers=headers,
                    timeout=(3, self.timeout),  # (connect timeout, read timeout)
                    allow_redirects=True,
                    verify=False,  # Phishing sites often have invalid SSL
                    stream=False  # We need full content
                )
                
                # Check if successful
                if response.status_code == 200:
                    # Check content type (skip if not HTML)
                    content_type = response.headers.get('Content-Type', '')
                    if 'text/html' not in content_type.lower() and 'application/xhtml' not in content_type.lower():
                        result['status'] = 'failed'
                        result['error'] = f'Non-HTML content: {content_type}'
                        continue
                    
                    # Get HTML content
                    html_content = response.text
                    result['original_size'] = len(html_content)
                    
                    # Skip if too small (likely error page)
                    if len(html_content) < 200:
                        result['status'] = 'failed'
                        result['error'] = 'HTML too small (< 200 bytes)'
                        continue
                    
                    # Optimize HTML for feature extraction
                    optimized_html = self._optimize_html(html_content)
                    
                    # Save to appropriate directory
                    filename = self._url_to_filename(original_url)
                    target_dir = self.legit_dir if label == 0 else self.phishing_dir
                    filepath = target_dir / filename
                    
                    with open(filepath, 'w', encoding='utf-8', errors='ignore') as f:
                        f.write(optimized_html)
                    
                    result['status'] = 'success'
                    result['filename'] = filename
                    result['size'] = len(optimized_html)
                    result['target_dir'] = str(target_dir.name)
                    result['compression_ratio'] = f"{(1 - len(optimized_html) / max(result['original_size'], 1)) * 100:.1f}%"
                    
                    if attempt_num > 0:
                        result['http_fallback'] = True
                        self.stats['http_fallback'] += 1
                    
                    self.stats['success'] += 1
                    return result  # Success!
                    
                else:
                    result['error'] = f"HTTP {response.status_code}"
                    if attempt_num == len(attempts) - 1:  # Last attempt
                        result['status'] = 'failed'
                        self.stats['failed'] += 1
                    
            except requests.Timeout:
                result['error'] = 'Timeout'
                if attempt_num == len(attempts) - 1:
                    result['status'] = 'timeout'
                    self.stats['timeout'] += 1
                    
            except requests.RequestException as e:
                result['error'] = f"{type(e).__name__}: {str(e)[:80]}"
                if attempt_num == len(attempts) - 1:
                    result['status'] = 'error'
                    self.stats['error'] += 1
                    
            except Exception as e:
                result['error'] = f"Unknown: {str(e)[:80]}"
                if attempt_num == len(attempts) - 1:
                    result['status'] = 'error'
                    self.stats['error'] += 1
        
        return result
    
    def download_batch(self, urls_df, label_column='label', id_column=None, resume=True):
        """
        Download all URLs of a DataFrame in parallel, with periodic checkpointing.

        Args:
            urls_df: DataFrame with a 'url' (or 'URL') column
            label_column: Column name for labels; rows get label 1 if the column is absent
            id_column: Optional column name for IDs; the row index is used if absent
            resume: When True, URLs recorded in the checkpoint are filtered out
                before downloading. When False, the loaded checkpoint is
                discarded so every URL is downloaded again.

        Returns:
            DataFrame with one row per attempted URL. It always has the columns
            url, label, url_id, status, error, filename, size and
            original_size, even when nothing was attempted.
        """
        self.stats['total'] = len(urls_df)
        url_column = 'url' if 'url' in urls_df.columns else 'URL'

        if not resume:
            # Workers skip anything listed in completed_urls, so "start fresh"
            # has to drop the checkpoint state, not merely skip the filtering
            self.completed_urls = set()
        elif self.completed_urls:
            urls_df = urls_df[~urls_df[url_column].isin(self.completed_urls)].copy()
            skipped = self.stats['total'] - len(urls_df)
            if skipped > 0:
                logger.info(f"Resuming: {skipped:,} URLs already downloaded, {len(urls_df):,} remaining")

        logger.info(f"Starting optimized download of {len(urls_df):,} URLs...")
        logger.info(f"Workers: {self.max_workers} | Timeout: {self.timeout}s | Checkpoint: every {self.checkpoint_interval} URLs")
        logger.info(f"Output: {self.output_dir.absolute()}")
        logger.info(f"Features: Session pooling, retry logic, HTTP fallback, rate limiting, compression")

        # Column presence is the same for every row, so decide it once
        has_label = label_column is not None and label_column in urls_df.columns
        has_id = bool(id_column) and id_column in urls_df.columns

        tasks = []
        for idx, row in urls_df.iterrows():
            url = row[url_column]
            label = row[label_column] if has_label else 1
            url_id = row[id_column] if has_id else idx
            tasks.append((url, label, url_id))

        results = []
        checkpointed = 0  # number of results already passed to _save_checkpoint

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit tasks, assigning pooled sessions round-robin
            future_to_task = {}
            for position, (url, label, url_id) in enumerate(tasks):
                session = self.sessions[position % len(self.sessions)]
                future = executor.submit(self.download_single_url, url, label, url_id, session)
                future_to_task[future] = (url, label, url_id)

            with tqdm(total=len(tasks), desc="Downloading", unit="url") as pbar:
                for future in as_completed(future_to_task):
                    try:
                        result = future.result()
                    except Exception as e:
                        # One misbehaving task must not abort the whole batch
                        url, label, url_id = future_to_task[future]
                        logger.warning(f"Download task crashed for {url!r}: {e}")
                        self.stats['error'] += 1
                        result = {
                            'url': url,
                            'label': label,
                            'url_id': url_id,
                            'status': 'error',
                            'error': f"Unknown: {str(e)[:80]}",
                            'filename': None,
                            'size': 0,
                            'original_size': 0
                        }

                    results.append(result)
                    pbar.update(1)

                    # Save checkpoint periodically; only results not yet
                    # recorded are passed, since the checkpoint accumulates
                    if len(results) - checkpointed >= self.checkpoint_interval:
                        self._save_checkpoint(results[checkpointed:])
                        checkpointed = len(results)

                    pbar.set_postfix({
                        'OK': self.stats['success'],
                        'Fail': self.stats['failed'],
                        'Timeout': self.stats['timeout'],
                        'HTTP↓': self.stats['http_fallback']
                    })

        # Final checkpoint save
        self._save_checkpoint(results[checkpointed:])

        # Keep a stable schema even when there is nothing to report, so callers
        # can always select on 'status'
        if results:
            results_df = pd.DataFrame(results)
        else:
            results_df = pd.DataFrame(columns=[
                'url', 'label', 'url_id', 'status', 'error',
                'filename', 'size', 'original_size'
            ])

        self._print_summary(results_df)

        return results_df
    
    def _print_summary(self, results_df):
        """
        Log the run's counters followed by per-label, storage and failure breakdowns.

        Args:
            results_df: DataFrame of download results; the breakdowns are only
                produced when it is non-empty and has a 'status' column
        """
        rule = "=" * 80
        counters = self.stats
        total = counters['total']
        success = counters['success']
        success_pct = success / max(total, 1) * 100

        logger.info("\n" + rule)
        logger.info("DOWNLOAD SUMMARY")
        logger.info(rule)

        logger.info(f"\nTotal URLs processed: {total:,}")
        logger.info(f"  ✓ Successful:  {success:,} ({success_pct:.1f}%)")
        logger.info(f"  ✗ Failed:      {counters['failed']:,}")
        logger.info(f"  ⏱ Timeout:     {counters['timeout']:,}")
        logger.info(f"  ⚠ Error:       {counters['error']:,}")
        logger.info(f"  ↓ HTTP Fallback: {counters['http_fallback']:,}")

        # Nothing more to report without per-URL results
        if results_df.empty or 'status' not in results_df.columns:
            logger.info(rule)
            return

        succeeded_mask = results_df['status'] == 'success'
        successful = results_df[succeeded_mask]
        failed = results_df[~succeeded_mask]

        # Successful downloads per label
        if 'label' in results_df.columns:
            per_label = successful.groupby('label').size()
            if not per_label.empty:
                logger.info("\nSuccessful downloads by type:")
                for label, count in per_label.items():
                    if label == 1:
                        label_name = 'Phishing'
                    else:
                        label_name = 'Legitimate'
                    logger.info(f"  {label_name}: {count:,}")

        # Sizes before and after optimization
        if not successful.empty and 'size' in successful.columns:
            total_optimized = successful['size'].sum()
            total_original = successful.get('original_size', successful['size']).sum()
            mb = 1024 * 1024

            logger.info("\nStorage statistics:")
            logger.info(f"  Original size:  {total_original/1024/1024:.2f} MB")
            logger.info(f"  Optimized size: {total_optimized/1024/1024:.2f} MB")
            if total_original > 0:
                saved = (1 - total_optimized / total_original) * 100
                logger.info(f"  Space saved:    {saved:.1f}%")

        # Five most frequent error messages among non-successful rows
        if not failed.empty and 'error' in failed.columns:
            top_errors = failed['error'].value_counts().head(5)
            if not top_errors.empty:
                logger.info("\nTop failure reasons:")
                for error, count in top_errors.items():
                    logger.info(f"  {error}: {count:,}")

        logger.info(rule)


def main():
    """
    Command-line entry point: load a labeled URL CSV, download the pages, save reports.

    Reads the input CSV (needs a 'url' or 'URL' column and a 'label' column),
    optionally balances the classes and/or limits the number of rows, runs the
    downloader, then writes a timestamped results CSV and the URL-to-filename
    metadata CSV into the output directory. When resuming, metadata from
    earlier runs is kept and merged with this run's successes.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Download HTML content from URLs and organize by label')
    parser.add_argument('--input', type=str, default='data/processed/clean_dataset.csv',
                        help='Input CSV file with URLs (must have url,label,type columns)')
    parser.add_argument('--output', type=str, default='data/html',
                        help='Base output directory (will create legitimate/ and phishing/ subdirectories)')
    parser.add_argument('--workers', type=int, default=20,
                        help='Number of parallel download workers (default: 20)')
    parser.add_argument('--timeout', type=int, default=8,
                        help='Request timeout in seconds (default: 8s)')
    parser.add_argument('--checkpoint', type=int, default=100,
                        help='Save progress every N URLs (default: 100)')
    parser.add_argument('--resume', action='store_true', default=True,
                        help='Resume from checkpoint (default: True)')
    parser.add_argument('--no-resume', dest='resume', action='store_false',
                        help='Start fresh, ignore checkpoint')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of URLs to download (for testing)')
    parser.add_argument('--balance', action='store_true',
                        help='Download equal number of legitimate and phishing URLs')

    args = parser.parse_args()

    logger.info("="*80)
    logger.info("HTML CONTENT DOWNLOADER - Phishing Detection")
    logger.info("="*80)

    # Relative --input/--output paths are resolved against the directory three
    # levels above this file
    script_dir = Path(__file__).parent.parent.parent
    input_path = (script_dir / args.input).resolve()

    logger.info(f"\nLoading URLs from: {input_path}")
    if not input_path.is_file():
        logger.error(f"Input file not found: {input_path}")
        return

    df = pd.read_csv(input_path)
    logger.info(f"Loaded: {len(df):,} URLs")

    # Show columns
    logger.info(f"Columns: {list(df.columns)}")

    # Verify required columns
    if 'url' not in df.columns and 'URL' not in df.columns:
        logger.error("No 'url' or 'URL' column found in dataset!")
        return

    if 'label' not in df.columns:
        logger.error("No 'label' column found in dataset!")
        return

    # Show label distribution
    logger.info(f"\nLabel distribution in dataset:")
    label_counts = df['label'].value_counts()
    for label, count in label_counts.items():
        label_name = 'Legitimate' if label == 0 else 'Phishing'
        logger.info(f"  {label_name} (label={label}): {count:,}")

    # Balance dataset if requested: sample the same number of rows from each
    # class, then shuffle
    if args.balance:
        min_count = label_counts.min()
        legit_rows = df[df['label'] == 0]
        phishing_rows = df[df['label'] == 1]
        df = pd.concat([
            legit_rows.sample(n=min(min_count, len(legit_rows)), random_state=42),
            phishing_rows.sample(n=min(min_count, len(phishing_rows)), random_state=42)
        ]).sample(frac=1, random_state=42).reset_index(drop=True)
        logger.info(f"\nBalanced dataset to {min_count:,} samples per class")
        logger.info(f"Total URLs after balancing: {len(df):,}")

    # Limit for testing
    if args.limit:
        df = df.head(args.limit)
        logger.info(f"Limited to first {args.limit:,} URLs for testing")

    # Initialize optimized downloader
    output_dir = (script_dir / args.output).resolve()
    downloader = HTMLDownloader(
        output_dir=output_dir,
        max_workers=args.workers,
        timeout=args.timeout,
        checkpoint_interval=args.checkpoint
    )

    # Download HTML content with checkpointing
    results_df = downloader.download_batch(
        df,
        label_column='label',
        id_column='phish_id' if 'phish_id' in df.columns else None,  # type: ignore
        resume=args.resume
    )

    # Save results
    results_file = output_dir / f'download_results_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    results_df.to_csv(results_file, index=False)
    logger.info(f"\n✓ Results saved to: {results_file}")

    # Build metadata mapping (URL to filename) for this run's successes; an
    # empty result set may have no columns at all
    metadata_columns = ['url', 'label', 'filename', 'url_id']
    if 'status' in results_df.columns:
        metadata = results_df[results_df['status'] == 'success'][metadata_columns]
    else:
        metadata = pd.DataFrame(columns=metadata_columns)

    metadata_file = output_dir / 'html_metadata.csv'

    # A resumed run only contains newly downloaded URLs, so keep the entries
    # written by earlier runs instead of overwriting them
    if args.resume and metadata_file.exists():
        try:
            previous = pd.read_csv(metadata_file)
            metadata = pd.concat([previous, metadata], ignore_index=True)
            metadata = metadata.drop_duplicates(subset='url', keep='last')
        except (OSError, ValueError, KeyError) as e:
            logger.warning(f"Could not merge existing metadata ({e}); writing this run only")

    metadata.to_csv(metadata_file, index=False)
    logger.info(f"✓ Metadata saved to: {metadata_file}")

    logger.info("\n" + "="*80)
    logger.info("✓ HTML DOWNLOAD COMPLETE!")
    logger.info("="*80)
    logger.info(f"\nFiles saved to:")
    logger.info(f"  Legitimate: {output_dir / 'legitimate'}")
    logger.info(f"  Phishing: {output_dir / 'phishing'}")
    logger.info(f"\nHTML files have been optimized for feature extraction:")
    logger.info(f"  - Comments removed")
    logger.info(f"  - Whitespace normalized")
    logger.info(f"  - Inline styles removed")
    logger.info(f"  - Structure preserved for feature extraction")
    logger.info("="*80)


if __name__ == "__main__":
    main()