# selenium-scraper / performance_comparison.py
# apexherbert200's picture
# First commit
# f2c46e7
#!/usr/bin/env python3
"""
Performance comparison between legacy and optimized scrapers
"""
import time
from concurrent.futures import ThreadPoolExecutor
from clickloom_scrape import scraper as optimized_scraper
from legacy_scraper import legacy_scraper
def time_function(func, *args, **kwargs):
    """Execute *func* with the given arguments and measure its duration.

    Args:
        func: Callable to time.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, elapsed_seconds)`` tuple where ``result`` is whatever
        ``func`` returned and ``elapsed_seconds`` is a float.
    """
    # perf_counter is a monotonic, high-resolution clock intended for
    # interval timing; time.time() can jump backwards/forwards if the
    # system wall clock is adjusted mid-measurement.
    start = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start
    return result, elapsed
def compare_single_scrape():
    """Compare a single scrape between the legacy and optimized scrapers.

    Runs each scraper once against the same test URL, prints each timing,
    and (when both succeed) the relative improvement and speedup factor.

    Returns:
        Tuple ``(legacy_time, optimized_time)`` in seconds; a failed run is
        reported as ``float('inf')``.
    """
    print("=== Single Scrape Comparison ===")
    test_url = "https://httpbin.org/html"

    # Test legacy scraper — only the timing matters, so the scrape result
    # is deliberately discarded.
    print("Testing legacy scraper...")
    try:
        _, legacy_time = time_function(legacy_scraper, test_url)
        print(f"Legacy scraper: {legacy_time:.2f} seconds")
        legacy_success = True
    except Exception as e:
        print(f"Legacy scraper failed: {e}")
        legacy_time = float('inf')
        legacy_success = False

    # Test optimized scraper
    print("Testing optimized scraper...")
    try:
        _, optimized_time = time_function(optimized_scraper, test_url)
        print(f"Optimized scraper: {optimized_time:.2f} seconds")
        optimized_success = True
    except Exception as e:
        print(f"Optimized scraper failed: {e}")
        optimized_time = float('inf')
        optimized_success = False

    # Only compare when both runs produced a finite timing.
    if legacy_success and optimized_success:
        improvement = ((legacy_time - optimized_time) / legacy_time) * 100
        print(f"Performance improvement: {improvement:.1f}%")
        if optimized_time < legacy_time:
            print(f"Optimized scraper is {legacy_time / optimized_time:.1f}x faster")
        else:
            print(f"Legacy scraper is {optimized_time / legacy_time:.1f}x faster")

    return legacy_time, optimized_time
def compare_repeated_scrapes():
    """Compare repeated scrapes to show driver pooling benefits.

    Runs each scraper ``num_scrapes`` times against the same URL so that
    the optimized scraper's driver reuse (vs. the legacy per-call driver
    startup) shows up in the totals and averages.

    Returns:
        Tuple ``(legacy_total, optimized_total)`` of summed successful
        scrape times in seconds (0 if every attempt failed).
    """
    print("\n=== Repeated Scrapes Comparison ===")
    test_url = "https://httpbin.org/html"
    num_scrapes = 3

    def _run_batch(scraper, label):
        """Run *scraper* num_scrapes times; return (total, avg) over the successes."""
        print(f"Testing {label} scraper ({num_scrapes} scrapes)...")
        times = []
        for i in range(num_scrapes):
            try:
                _, scrape_time = time_function(scraper, test_url)
                times.append(scrape_time)
                print(f" Scrape {i+1}: {scrape_time:.2f} seconds")
            except Exception as e:
                print(f" Scrape {i+1} failed: {e}")
                times.append(float('inf'))
        finite = [t for t in times if t != float('inf')]
        total = sum(finite)
        # BUGFIX: the original guarded the division with `if legacy_times`,
        # which is always truthy (3 entries), but divided by the count of
        # *finite* times — raising ZeroDivisionError when every scrape
        # failed. Guard on the finite list instead.
        avg = total / len(finite) if finite else 0
        return total, avg

    legacy_total, legacy_avg = _run_batch(legacy_scraper, "legacy")
    optimized_total, optimized_avg = _run_batch(optimized_scraper, "optimized")

    print(f"\nLegacy total time: {legacy_total:.2f} seconds (avg: {legacy_avg:.2f}s)")
    print(f"Optimized total time: {optimized_total:.2f} seconds (avg: {optimized_avg:.2f}s)")

    if legacy_total > 0 and optimized_total > 0:
        improvement = ((legacy_total - optimized_total) / legacy_total) * 100
        print(f"Total time improvement: {improvement:.1f}%")
        print(f"Speedup factor: {legacy_total / optimized_total:.1f}x")

    return legacy_total, optimized_total
def compare_concurrent_scrapes():
    """Compare concurrent scraping performance of the two scrapers.

    Submits the same three URLs to a 3-worker thread pool for each scraper
    and measures wall-clock time for each whole batch.

    Returns:
        Tuple ``(legacy_concurrent_time, optimized_concurrent_time)`` in
        seconds; a failed batch is reported as ``float('inf')``.
    """
    print("\n=== Concurrent Scrapes Comparison ===")
    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml"
    ]

    def _run_concurrent(scraper, label):
        """Scrape all URLs concurrently; return (elapsed, success) for the batch."""
        print(f"Testing {label} scraper ({len(test_urls)} concurrent scrapes)...")
        start_time = time.time()
        try:
            with ThreadPoolExecutor(max_workers=3) as executor:
                futures = [executor.submit(scraper, url) for url in test_urls]
                # .result() must still be called so any worker exception
                # propagates; the scraped payloads themselves are unused,
                # so they are not kept in a throwaway list.
                for future in futures:
                    future.result()
            elapsed = time.time() - start_time
            print(f"{label.capitalize()} concurrent time: {elapsed:.2f} seconds")
            return elapsed, True
        except Exception as e:
            print(f"{label.capitalize()} concurrent scraping failed: {e}")
            return float('inf'), False

    legacy_concurrent_time, legacy_ok = _run_concurrent(legacy_scraper, "legacy")
    optimized_concurrent_time, optimized_ok = _run_concurrent(optimized_scraper, "optimized")

    if legacy_ok and optimized_ok:
        improvement = ((legacy_concurrent_time - optimized_concurrent_time) / legacy_concurrent_time) * 100
        print(f"Concurrent performance improvement: {improvement:.1f}%")
        print(f"Concurrent speedup factor: {legacy_concurrent_time / optimized_concurrent_time:.1f}x")

    return legacy_concurrent_time, optimized_concurrent_time
if __name__ == "__main__":
    print("🚀 Scraper Performance Comparison\n")
    try:
        # Single scrape comparison
        legacy_single, optimized_single = compare_single_scrape()

        # Repeated scrapes comparison
        legacy_repeated, optimized_repeated = compare_repeated_scrapes()

        # Concurrent scrapes comparison
        legacy_concurrent, optimized_concurrent = compare_concurrent_scrapes()

        print("\n" + "=" * 50)
        print("📊 PERFORMANCE SUMMARY")
        print("=" * 50)

        def _report(label, legacy_time, optimized_time):
            """Print the percentage improvement when both timings are usable."""
            # BUGFIX: also guard against legacy_time == 0 — when every
            # repeated scrape fails, compare_repeated_scrapes() returns 0,
            # and the original inf-only check then divided by zero.
            if 0 < legacy_time != float('inf') and optimized_time != float('inf'):
                improvement = ((legacy_time - optimized_time) / legacy_time) * 100
                print(f"{label} improvement: {improvement:.1f}%")

        _report("Single scrape", legacy_single, optimized_single)
        _report("Repeated scrapes", legacy_repeated, optimized_repeated)
        _report("Concurrent scrapes", legacy_concurrent, optimized_concurrent)

        print("\n🎯 KEY OPTIMIZATIONS:")
        print("• Driver pooling eliminates repeated initialization overhead")
        print("• Smart waiting replaces fixed 2-second delays")
        print("• Bulk JavaScript operations for faster element extraction")
        print("• Performance-optimized Chrome flags")
        print("• Proper timeout handling prevents hanging")
        print("• Thread-safe concurrent processing")
    except Exception as e:
        print(f"Comparison failed: {e}")
        print("Make sure you have internet connection and all dependencies installed.")