#!/usr/bin/env python3
"""
Performance comparison between legacy and optimized scrapers
"""
import time
from concurrent.futures import ThreadPoolExecutor
from clickloom_scrape import scraper as optimized_scraper
from legacy_scraper import legacy_scraper
def time_function(func, *args, **kwargs):
    """Execute *func* and measure how long it takes.

    Args:
        func: Callable to invoke.
        *args, **kwargs: Forwarded unchanged to *func*.

    Returns:
        Tuple of ``(result, elapsed_seconds)``.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is wall-clock
    # time and can jump (NTP adjustments), corrupting interval measurements.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    end_time = time.perf_counter()
    return result, end_time - start_time
def compare_single_scrape():
    """Benchmark one scrape with each scraper and report the difference.

    Returns:
        Tuple ``(legacy_time, optimized_time)`` in seconds; a failed run
        is reported as ``float('inf')``.
    """
    print("=== Single Scrape Comparison ===")

    url = "https://httpbin.org/html"
    timings = {}

    # Run both scrapers through the same timing path; a failure is recorded
    # as infinity so the comparison below is skipped automatically.
    for label, scrape in (("Legacy", legacy_scraper),
                          ("Optimized", optimized_scraper)):
        print(f"Testing {label.lower()} scraper...")
        try:
            _, elapsed = time_function(scrape, url)
        except Exception as exc:
            print(f"{label} scraper failed: {exc}")
            timings[label] = float('inf')
        else:
            print(f"{label} scraper: {elapsed:.2f} seconds")
            timings[label] = elapsed

    legacy_time = timings["Legacy"]
    optimized_time = timings["Optimized"]

    # Both runs succeeded (neither is inf) -> safe to compute ratios.
    if legacy_time != float('inf') and optimized_time != float('inf'):
        improvement = ((legacy_time - optimized_time) / legacy_time) * 100
        print(f"Performance improvement: {improvement:.1f}%")
        if optimized_time < legacy_time:
            print(f"Optimized scraper is {legacy_time / optimized_time:.1f}x faster")
        else:
            print(f"Legacy scraper is {optimized_time / legacy_time:.1f}x faster")

    return legacy_time, optimized_time
def compare_repeated_scrapes():
    """Compare repeated scrapes to show driver pooling benefits.

    Runs each scraper ``num_scrapes`` times against the same URL and sums
    the successful runs (failures are excluded from totals and averages).

    Returns:
        Tuple ``(legacy_total, optimized_total)`` of summed successful-run
        times in seconds.
    """
    print("\n=== Repeated Scrapes Comparison ===")

    test_url = "https://httpbin.org/html"
    num_scrapes = 3

    def _run_batch(label, scrape_func):
        # Run scrape_func num_scrapes times; return (total, avg) over the
        # successful runs only.
        print(f"Testing {label} scraper ({num_scrapes} scrapes)...")
        times = []
        for i in range(num_scrapes):
            try:
                _, scrape_time = time_function(scrape_func, test_url)
                times.append(scrape_time)
                print(f"  Scrape {i+1}: {scrape_time:.2f} seconds")
            except Exception as e:
                print(f"  Scrape {i+1} failed: {e}")
                times.append(float('inf'))
        successful = [t for t in times if t != float('inf')]
        total = sum(successful)
        # BUG FIX: the original guarded the division with `if legacy_times`
        # (the unfiltered list), so when every scrape failed the filtered
        # list was empty and the division raised ZeroDivisionError.
        avg = total / len(successful) if successful else 0
        return total, avg

    # Legacy creates a fresh driver per scrape; optimized reuses pooled drivers.
    legacy_total, legacy_avg = _run_batch("legacy", legacy_scraper)
    optimized_total, optimized_avg = _run_batch("optimized", optimized_scraper)

    print(f"\nLegacy total time: {legacy_total:.2f} seconds (avg: {legacy_avg:.2f}s)")
    print(f"Optimized total time: {optimized_total:.2f} seconds (avg: {optimized_avg:.2f}s)")

    if legacy_total > 0 and optimized_total > 0:
        improvement = ((legacy_total - optimized_total) / legacy_total) * 100
        print(f"Total time improvement: {improvement:.1f}%")
        print(f"Speedup factor: {legacy_total / optimized_total:.1f}x")

    return legacy_total, optimized_total
def compare_concurrent_scrapes():
    """Benchmark both scrapers fanning out over a thread pool.

    Returns:
        Tuple ``(legacy_concurrent_time, optimized_concurrent_time)`` in
        seconds; a failed batch is reported as ``float('inf')``.
    """
    print("\n=== Concurrent Scrapes Comparison ===")

    urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml"
    ]

    def _timed_batch(label, scrape):
        # Submit every URL to a 3-worker pool and time the whole batch;
        # returns (elapsed, succeeded).
        print(f"Testing {label} scraper ({len(urls)} concurrent scrapes)...")
        started = time.time()
        try:
            with ThreadPoolExecutor(max_workers=3) as pool:
                futures = [pool.submit(scrape, u) for u in urls]
                # Collect results so any worker exception propagates here.
                for fut in futures:
                    fut.result()
            elapsed = time.time() - started
            print(f"{label.capitalize()} concurrent time: {elapsed:.2f} seconds")
            return elapsed, True
        except Exception as exc:
            print(f"{label.capitalize()} concurrent scraping failed: {exc}")
            return float('inf'), False

    legacy_time, legacy_ok = _timed_batch("legacy", legacy_scraper)
    optimized_time, optimized_ok = _timed_batch("optimized", optimized_scraper)

    if legacy_ok and optimized_ok:
        improvement = ((legacy_time - optimized_time) / legacy_time) * 100
        print(f"Concurrent performance improvement: {improvement:.1f}%")
        print(f"Concurrent speedup factor: {legacy_time / optimized_time:.1f}x")

    return legacy_time, optimized_time
if __name__ == "__main__":
    print("🚀 Scraper Performance Comparison\n")

    try:
        # Run the three benchmark scenarios in order of increasing load.
        legacy_single, optimized_single = compare_single_scrape()
        legacy_repeated, optimized_repeated = compare_repeated_scrapes()
        legacy_concurrent, optimized_concurrent = compare_concurrent_scrapes()

        banner = "=" * 50
        print("\n" + banner)
        print("📊 PERFORMANCE SUMMARY")
        print(banner)

        # (label, legacy seconds, optimized seconds) per scenario; any
        # scenario where either side failed (inf) is skipped.
        scenarios = [
            ("Single scrape", legacy_single, optimized_single),
            ("Repeated scrapes", legacy_repeated, optimized_repeated),
            ("Concurrent scrapes", legacy_concurrent, optimized_concurrent),
        ]
        for label, legacy_t, optimized_t in scenarios:
            if legacy_t != float('inf') and optimized_t != float('inf'):
                gain = ((legacy_t - optimized_t) / legacy_t) * 100
                print(f"{label} improvement: {gain:.1f}%")

        print("\n🎯 KEY OPTIMIZATIONS:")
        for bullet in (
            "• Driver pooling eliminates repeated initialization overhead",
            "• Smart waiting replaces fixed 2-second delays",
            "• Bulk JavaScript operations for faster element extraction",
            "• Performance-optimized Chrome flags",
            "• Proper timeout handling prevents hanging",
            "• Thread-safe concurrent processing",
        ):
            print(bullet)
    except Exception as e:
        print(f"Comparison failed: {e}")
        print("Make sure you have internet connection and all dependencies installed.")