#!/usr/bin/env python3
"""
Performance comparison between legacy and optimized scrapers
"""
import time
from concurrent.futures import ThreadPoolExecutor

from clickloom_scrape import scraper as optimized_scraper
from legacy_scraper import legacy_scraper

# Sentinel recorded in a timing list when a scrape raised; filtered out of
# all totals/averages.
_FAILED = float('inf')


def time_function(func, *args, **kwargs):
    """Time a function execution.

    Returns ``(result, elapsed_seconds)``. Uses ``time.perf_counter`` —
    monotonic and high-resolution — instead of ``time.time``, which can jump
    backwards on clock adjustment and has coarse resolution on some platforms.
    """
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    return result, time.perf_counter() - start_time


def _run_scrape_series(scraper_func, url, num_scrapes):
    """Run *scraper_func* against *url* ``num_scrapes`` times sequentially.

    Returns the list of per-scrape durations; a failed scrape is recorded as
    ``_FAILED`` so it can be excluded from totals and averages.
    """
    times = []
    for i in range(num_scrapes):
        try:
            _, scrape_time = time_function(scraper_func, url)
            times.append(scrape_time)
            print(f"  Scrape {i+1}: {scrape_time:.2f} seconds")
        except Exception as e:
            print(f"  Scrape {i+1} failed: {e}")
            times.append(_FAILED)
    return times


def _total_and_avg(times):
    """Return ``(total, average)`` over the successful entries of *times*.

    Bug fix: the original guarded the average with ``if legacy_times`` (the
    unfiltered list), so an all-failure run — non-empty list, zero successes —
    raised ZeroDivisionError. Guard on the filtered successes instead.
    """
    ok = [t for t in times if t != _FAILED]
    total = sum(ok)
    avg = total / len(ok) if ok else 0
    return total, avg


def _run_concurrent(scraper_func, urls, label):
    """Scrape all *urls* concurrently with up to 3 workers.

    Returns ``(elapsed_seconds, success)``; on any worker failure elapsed is
    ``_FAILED`` and success is False. *label* prefixes the failure message.
    """
    start_time = time.perf_counter()
    try:
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(scraper_func, url) for url in urls]
            for future in futures:
                future.result()  # propagate any worker exception
        return time.perf_counter() - start_time, True
    except Exception as e:
        print(f"{label} concurrent scraping failed: {e}")
        return _FAILED, False


def compare_single_scrape():
    """Compare single scrape performance.

    Returns ``(legacy_time, optimized_time)``; a failed run is reported as
    ``float('inf')`` so callers can detect it.
    """
    print("=== Single Scrape Comparison ===")
    test_url = "https://httpbin.org/html"

    # Test legacy scraper
    print("Testing legacy scraper...")
    try:
        _, legacy_time = time_function(legacy_scraper, test_url)
        print(f"Legacy scraper: {legacy_time:.2f} seconds")
        legacy_success = True
    except Exception as e:
        print(f"Legacy scraper failed: {e}")
        legacy_time = _FAILED
        legacy_success = False

    # Test optimized scraper
    print("Testing optimized scraper...")
    try:
        _, optimized_time = time_function(optimized_scraper, test_url)
        print(f"Optimized scraper: {optimized_time:.2f} seconds")
        optimized_success = True
    except Exception as e:
        print(f"Optimized scraper failed: {e}")
        optimized_time = _FAILED
        optimized_success = False

    if legacy_success and optimized_success:
        improvement = ((legacy_time - optimized_time) / legacy_time) * 100
        print(f"Performance improvement: {improvement:.1f}%")
        if optimized_time < legacy_time:
            print(f"Optimized scraper is {legacy_time / optimized_time:.1f}x faster")
        else:
            print(f"Legacy scraper is {optimized_time / legacy_time:.1f}x faster")

    return legacy_time, optimized_time


def compare_repeated_scrapes():
    """Compare repeated scrapes to show driver pooling benefits.

    Returns ``(legacy_total, optimized_total)`` — totals over the successful
    scrapes only (0 when every scrape failed).
    """
    print("\n=== Repeated Scrapes Comparison ===")
    test_url = "https://httpbin.org/html"
    num_scrapes = 3

    # Legacy scraper creates a new driver for each scrape.
    print(f"Testing legacy scraper ({num_scrapes} scrapes)...")
    legacy_times = _run_scrape_series(legacy_scraper, test_url, num_scrapes)
    legacy_total, legacy_avg = _total_and_avg(legacy_times)

    # Optimized scraper reuses pooled drivers.
    print(f"Testing optimized scraper ({num_scrapes} scrapes)...")
    optimized_times = _run_scrape_series(optimized_scraper, test_url, num_scrapes)
    optimized_total, optimized_avg = _total_and_avg(optimized_times)

    print(f"\nLegacy total time: {legacy_total:.2f} seconds (avg: {legacy_avg:.2f}s)")
    print(f"Optimized total time: {optimized_total:.2f} seconds (avg: {optimized_avg:.2f}s)")

    if legacy_total > 0 and optimized_total > 0:
        improvement = ((legacy_total - optimized_total) / legacy_total) * 100
        print(f"Total time improvement: {improvement:.1f}%")
        print(f"Speedup factor: {legacy_total / optimized_total:.1f}x")

    return legacy_total, optimized_total


def compare_concurrent_scrapes():
    """Compare concurrent scraping performance.

    Returns ``(legacy_time, optimized_time)``; a failed run is reported as
    ``float('inf')``.
    """
    print("\n=== Concurrent Scrapes Comparison ===")
    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
    ]

    print(f"Testing legacy scraper ({len(test_urls)} concurrent scrapes)...")
    legacy_time, legacy_ok = _run_concurrent(legacy_scraper, test_urls, "Legacy")
    if legacy_ok:
        print(f"Legacy concurrent time: {legacy_time:.2f} seconds")

    print(f"Testing optimized scraper ({len(test_urls)} concurrent scrapes)...")
    optimized_time, optimized_ok = _run_concurrent(optimized_scraper, test_urls, "Optimized")
    if optimized_ok:
        print(f"Optimized concurrent time: {optimized_time:.2f} seconds")

    if legacy_ok and optimized_ok:
        improvement = ((legacy_time - optimized_time) / legacy_time) * 100
        print(f"Concurrent performance improvement: {improvement:.1f}%")
        print(f"Concurrent speedup factor: {legacy_time / optimized_time:.1f}x")

    return legacy_time, optimized_time


if __name__ == "__main__":
    print("šŸš€ Scraper Performance Comparison\n")
    try:
        # Single scrape comparison
        legacy_single, optimized_single = compare_single_scrape()

        # Repeated scrapes comparison
        legacy_repeated, optimized_repeated = compare_repeated_scrapes()

        # Concurrent scrapes comparison
        legacy_concurrent, optimized_concurrent = compare_concurrent_scrapes()

        print("\n" + "=" * 50)
        print("šŸ“Š PERFORMANCE SUMMARY")
        print("=" * 50)

        if legacy_single != float('inf') and optimized_single != float('inf'):
            single_improvement = ((legacy_single - optimized_single) / legacy_single) * 100
            print(f"Single scrape improvement: {single_improvement:.1f}%")

        # Bug fix: repeated totals are never inf — they are 0 when every
        # scrape failed — so the original inf-check let an all-failure run
        # through to a division by zero. Guard on > 0 instead.
        if legacy_repeated > 0 and optimized_repeated > 0:
            repeated_improvement = ((legacy_repeated - optimized_repeated) / legacy_repeated) * 100
            print(f"Repeated scrapes improvement: {repeated_improvement:.1f}%")

        if legacy_concurrent != float('inf') and optimized_concurrent != float('inf'):
            concurrent_improvement = ((legacy_concurrent - optimized_concurrent) / legacy_concurrent) * 100
            print(f"Concurrent scrapes improvement: {concurrent_improvement:.1f}%")

        print("\nšŸŽÆ KEY OPTIMIZATIONS:")
        print("• Driver pooling eliminates repeated initialization overhead")
        print("• Smart waiting replaces fixed 2-second delays")
        print("• Bulk JavaScript operations for faster element extraction")
        print("• Performance-optimized Chrome flags")
        print("• Proper timeout handling prevents hanging")
        print("• Thread-safe concurrent processing")
    except Exception as e:
        print(f"Comparison failed: {e}")
        print("Make sure you have internet connection and all dependencies installed.")