#!/usr/bin/env python3
"""
Performance comparison between legacy and optimized scrapers
"""
import time
from concurrent.futures import ThreadPoolExecutor
from clickloom_scrape import scraper as optimized_scraper
from legacy_scraper import legacy_scraper
def time_function(func, *args, **kwargs):
    """Execute *func* and measure how long it takes.

    Args:
        func: Callable to invoke.
        *args, **kwargs: Forwarded unchanged to *func*.

    Returns:
        Tuple of ``(result, elapsed_seconds)``.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is wall-clock
    # time and can jump (NTP adjustments), corrupting interval measurements.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    end_time = time.perf_counter()
    return result, end_time - start_time
def compare_single_scrape():
    """Benchmark one scrape with each scraper and report the difference.

    Returns:
        Tuple ``(legacy_time, optimized_time)`` in seconds; a failed run
        is reported as ``float('inf')``.
    """
    print("=== Single Scrape Comparison ===")

    url = "https://httpbin.org/html"
    timings = {}

    # Run both scrapers through the same timing path; a failure is recorded
    # as infinity so the comparison below is skipped automatically.
    for label, scrape in (("Legacy", legacy_scraper),
                          ("Optimized", optimized_scraper)):
        print(f"Testing {label.lower()} scraper...")
        try:
            _, elapsed = time_function(scrape, url)
        except Exception as exc:
            print(f"{label} scraper failed: {exc}")
            timings[label] = float('inf')
        else:
            print(f"{label} scraper: {elapsed:.2f} seconds")
            timings[label] = elapsed

    legacy_time = timings["Legacy"]
    optimized_time = timings["Optimized"]

    # Both runs succeeded (neither is inf) -> safe to compute ratios.
    if legacy_time != float('inf') and optimized_time != float('inf'):
        improvement = ((legacy_time - optimized_time) / legacy_time) * 100
        print(f"Performance improvement: {improvement:.1f}%")
        if optimized_time < legacy_time:
            print(f"Optimized scraper is {legacy_time / optimized_time:.1f}x faster")
        else:
            print(f"Legacy scraper is {optimized_time / legacy_time:.1f}x faster")

    return legacy_time, optimized_time
def compare_repeated_scrapes():
    """Compare repeated scrapes to show driver pooling benefits.

    Runs each scraper ``num_scrapes`` times against the same URL and sums
    the successful runs (failures are excluded from totals and averages).

    Returns:
        Tuple ``(legacy_total, optimized_total)`` of summed successful-run
        times in seconds.
    """
    print("\n=== Repeated Scrapes Comparison ===")

    test_url = "https://httpbin.org/html"
    num_scrapes = 3

    def _run_batch(label, scrape_func):
        # Run scrape_func num_scrapes times; return (total, avg) over the
        # successful runs only.
        print(f"Testing {label} scraper ({num_scrapes} scrapes)...")
        times = []
        for i in range(num_scrapes):
            try:
                _, scrape_time = time_function(scrape_func, test_url)
                times.append(scrape_time)
                print(f"  Scrape {i+1}: {scrape_time:.2f} seconds")
            except Exception as e:
                print(f"  Scrape {i+1} failed: {e}")
                times.append(float('inf'))
        successful = [t for t in times if t != float('inf')]
        total = sum(successful)
        # BUG FIX: the original guarded the division with `if legacy_times`
        # (the unfiltered list), so when every scrape failed the filtered
        # list was empty and the division raised ZeroDivisionError.
        avg = total / len(successful) if successful else 0
        return total, avg

    # Legacy creates a fresh driver per scrape; optimized reuses pooled drivers.
    legacy_total, legacy_avg = _run_batch("legacy", legacy_scraper)
    optimized_total, optimized_avg = _run_batch("optimized", optimized_scraper)

    print(f"\nLegacy total time: {legacy_total:.2f} seconds (avg: {legacy_avg:.2f}s)")
    print(f"Optimized total time: {optimized_total:.2f} seconds (avg: {optimized_avg:.2f}s)")

    if legacy_total > 0 and optimized_total > 0:
        improvement = ((legacy_total - optimized_total) / legacy_total) * 100
        print(f"Total time improvement: {improvement:.1f}%")
        print(f"Speedup factor: {legacy_total / optimized_total:.1f}x")

    return legacy_total, optimized_total
def compare_concurrent_scrapes():
    """Benchmark both scrapers fanning out over a thread pool.

    Returns:
        Tuple ``(legacy_concurrent_time, optimized_concurrent_time)`` in
        seconds; a failed batch is reported as ``float('inf')``.
    """
    print("\n=== Concurrent Scrapes Comparison ===")

    urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml"
    ]

    def _timed_batch(label, scrape):
        # Submit every URL to a 3-worker pool and time the whole batch;
        # returns (elapsed, succeeded).
        print(f"Testing {label} scraper ({len(urls)} concurrent scrapes)...")
        started = time.time()
        try:
            with ThreadPoolExecutor(max_workers=3) as pool:
                futures = [pool.submit(scrape, u) for u in urls]
                # Collect results so any worker exception propagates here.
                for fut in futures:
                    fut.result()
            elapsed = time.time() - started
            print(f"{label.capitalize()} concurrent time: {elapsed:.2f} seconds")
            return elapsed, True
        except Exception as exc:
            print(f"{label.capitalize()} concurrent scraping failed: {exc}")
            return float('inf'), False

    legacy_time, legacy_ok = _timed_batch("legacy", legacy_scraper)
    optimized_time, optimized_ok = _timed_batch("optimized", optimized_scraper)

    if legacy_ok and optimized_ok:
        improvement = ((legacy_time - optimized_time) / legacy_time) * 100
        print(f"Concurrent performance improvement: {improvement:.1f}%")
        print(f"Concurrent speedup factor: {legacy_time / optimized_time:.1f}x")

    return legacy_time, optimized_time
if __name__ == "__main__":
    print("🚀 Scraper Performance Comparison\n")

    try:
        # Run the three benchmark scenarios in order of increasing load.
        legacy_single, optimized_single = compare_single_scrape()
        legacy_repeated, optimized_repeated = compare_repeated_scrapes()
        legacy_concurrent, optimized_concurrent = compare_concurrent_scrapes()

        banner = "=" * 50
        print("\n" + banner)
        print("📊 PERFORMANCE SUMMARY")
        print(banner)

        # (label, legacy seconds, optimized seconds) per scenario; any
        # scenario where either side failed (inf) is skipped.
        scenarios = [
            ("Single scrape", legacy_single, optimized_single),
            ("Repeated scrapes", legacy_repeated, optimized_repeated),
            ("Concurrent scrapes", legacy_concurrent, optimized_concurrent),
        ]
        for label, legacy_t, optimized_t in scenarios:
            if legacy_t != float('inf') and optimized_t != float('inf'):
                gain = ((legacy_t - optimized_t) / legacy_t) * 100
                print(f"{label} improvement: {gain:.1f}%")

        print("\n🎯 KEY OPTIMIZATIONS:")
        for bullet in (
            "• Driver pooling eliminates repeated initialization overhead",
            "• Smart waiting replaces fixed 2-second delays",
            "• Bulk JavaScript operations for faster element extraction",
            "• Performance-optimized Chrome flags",
            "• Proper timeout handling prevents hanging",
            "• Thread-safe concurrent processing",
        ):
            print(bullet)
    except Exception as e:
        print(f"Comparison failed: {e}")
        print("Make sure you have internet connection and all dependencies installed.")