Spaces:
No application file
No application file
| #!/usr/bin/env python3 | |
| """ | |
| Performance comparison between legacy and optimized scrapers | |
| """ | |
| import time | |
| from concurrent.futures import ThreadPoolExecutor | |
| from clickloom_scrape import scraper as optimized_scraper | |
| from legacy_scraper import legacy_scraper | |
def time_function(func, *args, **kwargs):
    """Time a single call to *func*.

    Args:
        func: Callable to invoke.
        *args, **kwargs: Forwarded unchanged to ``func``.

    Returns:
        Tuple ``(result, elapsed_seconds)`` where ``result`` is whatever
        ``func`` returned.
    """
    # perf_counter() is monotonic and higher-resolution than time.time(),
    # so the measurement cannot go negative across system clock adjustments.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start_time
    return result, elapsed
def _run_timed_scrape(label, scraper, url):
    """Run one scrape, print its timing, and return ``(elapsed, success)``.

    A failed run reports ``float('inf')`` so callers can detect it.
    """
    print(f"Testing {label} scraper...")
    try:
        _, elapsed = time_function(scraper, url)
        print(f"{label.capitalize()} scraper: {elapsed:.2f} seconds")
        return elapsed, True
    except Exception as e:
        print(f"{label.capitalize()} scraper failed: {e}")
        return float('inf'), False


def compare_single_scrape():
    """Compare a single scrape with each implementation.

    Returns:
        Tuple ``(legacy_time, optimized_time)`` in seconds; a failed run
        is reported as ``float('inf')``.
    """
    print("=== Single Scrape Comparison ===")
    test_url = "https://httpbin.org/html"

    # Same duplicated timing/handling for both scrapers, factored out.
    legacy_time, legacy_success = _run_timed_scrape(
        "legacy", legacy_scraper, test_url)
    optimized_time, optimized_success = _run_timed_scrape(
        "optimized", optimized_scraper, test_url)

    # Only compare when both runs produced a finite measurement.
    if legacy_success and optimized_success:
        improvement = ((legacy_time - optimized_time) / legacy_time) * 100
        print(f"Performance improvement: {improvement:.1f}%")
        if optimized_time < legacy_time:
            print(f"Optimized scraper is {legacy_time / optimized_time:.1f}x faster")
        else:
            print(f"Legacy scraper is {optimized_time / legacy_time:.1f}x faster")

    return legacy_time, optimized_time
def _run_scrape_series(label, scraper, url, count):
    """Run *count* sequential scrapes of *url*, printing per-run timings.

    Returns:
        List of per-run times; a failed run contributes ``float('inf')``
        so it can be filtered out by the caller.
    """
    print(f"Testing {label} scraper ({count} scrapes)...")
    times = []
    for i in range(count):
        try:
            _, elapsed = time_function(scraper, url)
            times.append(elapsed)
            print(f" Scrape {i+1}: {elapsed:.2f} seconds")
        except Exception as e:
            print(f" Scrape {i+1} failed: {e}")
            times.append(float('inf'))
    return times


def compare_repeated_scrapes():
    """Compare repeated scrapes to show driver pooling benefits.

    Returns:
        Tuple ``(legacy_total, optimized_total)``: summed times of the
        successful runs only (0 when every run failed).
    """
    print("\n=== Repeated Scrapes Comparison ===")
    test_url = "https://httpbin.org/html"
    num_scrapes = 3

    legacy_times = _run_scrape_series(
        "legacy", legacy_scraper, test_url, num_scrapes)
    legacy_finite = [t for t in legacy_times if t != float('inf')]
    legacy_total = sum(legacy_finite)
    # BUG FIX: guard on the *filtered* list. The original guarded on the
    # raw list, so when every scrape failed the filtered list was empty
    # and the average raised ZeroDivisionError.
    legacy_avg = legacy_total / len(legacy_finite) if legacy_finite else 0

    optimized_times = _run_scrape_series(
        "optimized", optimized_scraper, test_url, num_scrapes)
    optimized_finite = [t for t in optimized_times if t != float('inf')]
    optimized_total = sum(optimized_finite)
    optimized_avg = optimized_total / len(optimized_finite) if optimized_finite else 0

    print(f"\nLegacy total time: {legacy_total:.2f} seconds (avg: {legacy_avg:.2f}s)")
    print(f"Optimized total time: {optimized_total:.2f} seconds (avg: {optimized_avg:.2f}s)")

    # Both totals must be positive for the ratio/percentage to be meaningful.
    if legacy_total > 0 and optimized_total > 0:
        improvement = ((legacy_total - optimized_total) / legacy_total) * 100
        print(f"Total time improvement: {improvement:.1f}%")
        print(f"Speedup factor: {legacy_total / optimized_total:.1f}x")

    return legacy_total, optimized_total
def _run_concurrent_batch(label, scraper, urls):
    """Scrape *urls* concurrently with *scraper*.

    Returns:
        Tuple ``(elapsed, success)``; ``elapsed`` is ``float('inf')``
        when any worker raised.
    """
    print(f"Testing {label} scraper ({len(urls)} concurrent scrapes)...")
    start_time = time.time()
    try:
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(scraper, url) for url in urls]
            # Consume every future so worker exceptions propagate here;
            # the original stored the results but never used them.
            for future in futures:
                future.result()
        elapsed = time.time() - start_time
        print(f"{label.capitalize()} concurrent time: {elapsed:.2f} seconds")
        return elapsed, True
    except Exception as e:
        print(f"{label.capitalize()} concurrent scraping failed: {e}")
        return float('inf'), False


def compare_concurrent_scrapes():
    """Compare concurrent scraping performance.

    Returns:
        Tuple ``(legacy_concurrent_time, optimized_concurrent_time)``;
        a failed batch is reported as ``float('inf')``.
    """
    print("\n=== Concurrent Scrapes Comparison ===")
    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
    ]

    legacy_time, legacy_success = _run_concurrent_batch(
        "legacy", legacy_scraper, test_urls)
    optimized_time, optimized_success = _run_concurrent_batch(
        "optimized", optimized_scraper, test_urls)

    # Only compare when both batches completed without an exception.
    if legacy_success and optimized_success:
        improvement = ((legacy_time - optimized_time) / legacy_time) * 100
        print(f"Concurrent performance improvement: {improvement:.1f}%")
        print(f"Concurrent speedup factor: {legacy_time / optimized_time:.1f}x")

    return legacy_time, optimized_time
| if __name__ == "__main__": | |
| print("🚀 Scraper Performance Comparison\n") | |
| try: | |
| # Single scrape comparison | |
| legacy_single, optimized_single = compare_single_scrape() | |
| # Repeated scrapes comparison | |
| legacy_repeated, optimized_repeated = compare_repeated_scrapes() | |
| # Concurrent scrapes comparison | |
| legacy_concurrent, optimized_concurrent = compare_concurrent_scrapes() | |
| print("\n" + "="*50) | |
| print("📊 PERFORMANCE SUMMARY") | |
| print("="*50) | |
| if legacy_single != float('inf') and optimized_single != float('inf'): | |
| single_improvement = ((legacy_single - optimized_single) / legacy_single) * 100 | |
| print(f"Single scrape improvement: {single_improvement:.1f}%") | |
| if legacy_repeated != float('inf') and optimized_repeated != float('inf'): | |
| repeated_improvement = ((legacy_repeated - optimized_repeated) / legacy_repeated) * 100 | |
| print(f"Repeated scrapes improvement: {repeated_improvement:.1f}%") | |
| if legacy_concurrent != float('inf') and optimized_concurrent != float('inf'): | |
| concurrent_improvement = ((legacy_concurrent - optimized_concurrent) / legacy_concurrent) * 100 | |
| print(f"Concurrent scrapes improvement: {concurrent_improvement:.1f}%") | |
| print("\n🎯 KEY OPTIMIZATIONS:") | |
| print("• Driver pooling eliminates repeated initialization overhead") | |
| print("• Smart waiting replaces fixed 2-second delays") | |
| print("• Bulk JavaScript operations for faster element extraction") | |
| print("• Performance-optimized Chrome flags") | |
| print("• Proper timeout handling prevents hanging") | |
| print("• Thread-safe concurrent processing") | |
| except Exception as e: | |
| print(f"Comparison failed: {e}") | |
| print("Make sure you have internet connection and all dependencies installed.") | |