# selenium-scraper / performance_comparison.py
# apexherbert200's picture
# First commit
# f2c46e7
#!/usr/bin/env python3
"""
Performance comparison between legacy and optimized scrapers
"""
import time
from concurrent.futures import ThreadPoolExecutor
from clickloom_scrape import scraper as optimized_scraper
from legacy_scraper import legacy_scraper
def time_function(func, *args, **kwargs):
    """Execute *func* with the given arguments and measure its duration.

    Args:
        func: Callable to time.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, elapsed_seconds)`` tuple where ``result`` is whatever
        ``func`` returned and ``elapsed_seconds`` is a float.
    """
    # perf_counter is a monotonic, high-resolution clock intended for
    # interval timing; time.time() can jump backwards/forwards if the
    # system wall clock is adjusted mid-measurement.
    start = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start
    return result, elapsed
def compare_single_scrape():
    """Compare a single scrape between the legacy and optimized scrapers.

    Runs each scraper once against the same test URL, prints each timing,
    and (when both succeed) the relative improvement and speedup factor.

    Returns:
        Tuple ``(legacy_time, optimized_time)`` in seconds; a failed run is
        reported as ``float('inf')``.
    """
    print("=== Single Scrape Comparison ===")
    test_url = "https://httpbin.org/html"

    # Test legacy scraper — only the timing matters, so the scrape result
    # is deliberately discarded.
    print("Testing legacy scraper...")
    try:
        _, legacy_time = time_function(legacy_scraper, test_url)
        print(f"Legacy scraper: {legacy_time:.2f} seconds")
        legacy_success = True
    except Exception as e:
        print(f"Legacy scraper failed: {e}")
        legacy_time = float('inf')
        legacy_success = False

    # Test optimized scraper
    print("Testing optimized scraper...")
    try:
        _, optimized_time = time_function(optimized_scraper, test_url)
        print(f"Optimized scraper: {optimized_time:.2f} seconds")
        optimized_success = True
    except Exception as e:
        print(f"Optimized scraper failed: {e}")
        optimized_time = float('inf')
        optimized_success = False

    # Only compare when both runs produced a finite timing.
    if legacy_success and optimized_success:
        improvement = ((legacy_time - optimized_time) / legacy_time) * 100
        print(f"Performance improvement: {improvement:.1f}%")
        if optimized_time < legacy_time:
            print(f"Optimized scraper is {legacy_time / optimized_time:.1f}x faster")
        else:
            print(f"Legacy scraper is {optimized_time / legacy_time:.1f}x faster")

    return legacy_time, optimized_time
def compare_repeated_scrapes():
    """Compare repeated scrapes to show driver pooling benefits.

    Runs each scraper ``num_scrapes`` times against the same URL so that
    the optimized scraper's driver reuse (vs. the legacy per-call driver
    startup) shows up in the totals and averages.

    Returns:
        Tuple ``(legacy_total, optimized_total)`` of summed successful
        scrape times in seconds (0 if every attempt failed).
    """
    print("\n=== Repeated Scrapes Comparison ===")
    test_url = "https://httpbin.org/html"
    num_scrapes = 3

    def _run_batch(scraper, label):
        """Run *scraper* num_scrapes times; return (total, avg) over the successes."""
        print(f"Testing {label} scraper ({num_scrapes} scrapes)...")
        times = []
        for i in range(num_scrapes):
            try:
                _, scrape_time = time_function(scraper, test_url)
                times.append(scrape_time)
                print(f" Scrape {i+1}: {scrape_time:.2f} seconds")
            except Exception as e:
                print(f" Scrape {i+1} failed: {e}")
                times.append(float('inf'))
        finite = [t for t in times if t != float('inf')]
        total = sum(finite)
        # BUGFIX: the original guarded the division with `if legacy_times`,
        # which is always truthy (3 entries), but divided by the count of
        # *finite* times — raising ZeroDivisionError when every scrape
        # failed. Guard on the finite list instead.
        avg = total / len(finite) if finite else 0
        return total, avg

    legacy_total, legacy_avg = _run_batch(legacy_scraper, "legacy")
    optimized_total, optimized_avg = _run_batch(optimized_scraper, "optimized")

    print(f"\nLegacy total time: {legacy_total:.2f} seconds (avg: {legacy_avg:.2f}s)")
    print(f"Optimized total time: {optimized_total:.2f} seconds (avg: {optimized_avg:.2f}s)")

    if legacy_total > 0 and optimized_total > 0:
        improvement = ((legacy_total - optimized_total) / legacy_total) * 100
        print(f"Total time improvement: {improvement:.1f}%")
        print(f"Speedup factor: {legacy_total / optimized_total:.1f}x")

    return legacy_total, optimized_total
def compare_concurrent_scrapes():
    """Compare concurrent scraping performance of the two scrapers.

    Submits the same three URLs to a 3-worker thread pool for each scraper
    and measures wall-clock time for each whole batch.

    Returns:
        Tuple ``(legacy_concurrent_time, optimized_concurrent_time)`` in
        seconds; a failed batch is reported as ``float('inf')``.
    """
    print("\n=== Concurrent Scrapes Comparison ===")
    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml"
    ]

    def _run_concurrent(scraper, label):
        """Scrape all URLs concurrently; return (elapsed, success) for the batch."""
        print(f"Testing {label} scraper ({len(test_urls)} concurrent scrapes)...")
        start_time = time.time()
        try:
            with ThreadPoolExecutor(max_workers=3) as executor:
                futures = [executor.submit(scraper, url) for url in test_urls]
                # .result() must still be called so any worker exception
                # propagates; the scraped payloads themselves are unused,
                # so they are not kept in a throwaway list.
                for future in futures:
                    future.result()
            elapsed = time.time() - start_time
            print(f"{label.capitalize()} concurrent time: {elapsed:.2f} seconds")
            return elapsed, True
        except Exception as e:
            print(f"{label.capitalize()} concurrent scraping failed: {e}")
            return float('inf'), False

    legacy_concurrent_time, legacy_ok = _run_concurrent(legacy_scraper, "legacy")
    optimized_concurrent_time, optimized_ok = _run_concurrent(optimized_scraper, "optimized")

    if legacy_ok and optimized_ok:
        improvement = ((legacy_concurrent_time - optimized_concurrent_time) / legacy_concurrent_time) * 100
        print(f"Concurrent performance improvement: {improvement:.1f}%")
        print(f"Concurrent speedup factor: {legacy_concurrent_time / optimized_concurrent_time:.1f}x")

    return legacy_concurrent_time, optimized_concurrent_time
if __name__ == "__main__":
    print("🚀 Scraper Performance Comparison\n")
    try:
        # Single scrape comparison
        legacy_single, optimized_single = compare_single_scrape()

        # Repeated scrapes comparison
        legacy_repeated, optimized_repeated = compare_repeated_scrapes()

        # Concurrent scrapes comparison
        legacy_concurrent, optimized_concurrent = compare_concurrent_scrapes()

        print("\n" + "=" * 50)
        print("📊 PERFORMANCE SUMMARY")
        print("=" * 50)

        def _report(label, legacy_time, optimized_time):
            """Print the percentage improvement when both timings are usable."""
            # BUGFIX: also guard against legacy_time == 0 — when every
            # repeated scrape fails, compare_repeated_scrapes() returns 0,
            # and the original inf-only check then divided by zero.
            if 0 < legacy_time != float('inf') and optimized_time != float('inf'):
                improvement = ((legacy_time - optimized_time) / legacy_time) * 100
                print(f"{label} improvement: {improvement:.1f}%")

        _report("Single scrape", legacy_single, optimized_single)
        _report("Repeated scrapes", legacy_repeated, optimized_repeated)
        _report("Concurrent scrapes", legacy_concurrent, optimized_concurrent)

        print("\n🎯 KEY OPTIMIZATIONS:")
        print("• Driver pooling eliminates repeated initialization overhead")
        print("• Smart waiting replaces fixed 2-second delays")
        print("• Bulk JavaScript operations for faster element extraction")
        print("• Performance-optimized Chrome flags")
        print("• Proper timeout handling prevents hanging")
        print("• Thread-safe concurrent processing")
    except Exception as e:
        print(f"Comparison failed: {e}")
        print("Make sure you have internet connection and all dependencies installed.")