# selenium-scraper / test_performance.py
# First commit (f2c46e7) by apexherbert200
#!/usr/bin/env python3
"""
Performance test script for the optimized scraper
"""
import time
import asyncio
from clickloom_scrape import scraper
from concurrent.futures import ThreadPoolExecutor
def test_single_scrape():
    """Time one scraper() call against httpbin and print basic result stats.

    Returns:
        float: elapsed wall-clock seconds for the single scrape.
    """
    print("Testing single scrape performance...")
    test_url = "https://httpbin.org/html"

    # perf_counter is monotonic and high-resolution, so it cannot go
    # backwards mid-measurement the way time.time() can (e.g. NTP sync).
    start_time = time.perf_counter()
    result = scraper(test_url)
    elapsed = time.perf_counter() - start_time

    print(f"Single scrape took: {elapsed:.2f} seconds")
    # scraper() is expected to return a dict; .get() keeps the report
    # working even when a key is absent (e.g. on a failed scrape).
    print(f"Page text length: {len(result.get('page_text', ''))}")
    print(f"Script sources found: {len(result.get('script_sources', []))}")
    print(f"Link sources found: {len(result.get('link_sources', []))}")

    if 'error' in result:
        print(f"Error occurred: {result['error']}")

    return elapsed
def test_concurrent_scrapes():
    """Scrape several httpbin URLs concurrently and report the timings.

    Returns:
        float: total elapsed wall-clock seconds for all URLs.
    """
    print("\nTesting concurrent scrape performance...")
    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
        "https://httpbin.org/robots.txt",
        "https://httpbin.org/status/200",
    ]

    # perf_counter is the monotonic interval clock; time.time() can jump.
    start_time = time.perf_counter()

    # executor.map preserves input order, so results[i] matches
    # test_urls[i] — same as the original submit/collect loop, less code.
    with ThreadPoolExecutor(max_workers=3) as executor:
        results = list(executor.map(scraper, test_urls))

    elapsed = time.perf_counter() - start_time

    print(f"Concurrent scrapes ({len(test_urls)} URLs) took: {elapsed:.2f} seconds")
    print(f"Average time per URL: {elapsed / len(test_urls):.2f} seconds")

    for i, result in enumerate(results):
        if 'error' in result:
            print(f"URL {i+1} error: {result['error']}")
        else:
            print(f"URL {i+1}: {len(result.get('page_text', ''))} chars")

    return elapsed
def test_repeated_scrapes(num_scrapes=5):
    """Scrape the same URL repeatedly to show driver-pooling benefits.

    The first scrape is expected to be slower (driver creation); later
    scrapes should be faster if the scraper reuses pooled drivers.

    Args:
        num_scrapes: how many sequential scrapes to run (default 5,
            matching the original hard-coded count).

    Returns:
        float: average seconds per scrape across all runs.
    """
    print("\nTesting repeated scrapes (driver pooling benefits)...")
    test_url = "https://httpbin.org/html"
    times = []

    for i in range(num_scrapes):
        # perf_counter: monotonic, high-resolution interval clock.
        start_time = time.perf_counter()
        result = scraper(test_url)
        scrape_time = time.perf_counter() - start_time
        times.append(scrape_time)
        print(f"Scrape {i+1}: {scrape_time:.2f} seconds")

        if 'error' in result:
            print(f"  Error: {result['error']}")

    avg_time = sum(times) / len(times)
    print(f"\nAverage time per scrape: {avg_time:.2f} seconds")
    print(f"First scrape: {times[0]:.2f} seconds (includes driver creation)")
    # Guard: with num_scrapes == 1 there are no "subsequent" scrapes,
    # and the original times[1:] average would divide by zero.
    if len(times) > 1:
        print(f"Subsequent scrapes avg: {sum(times[1:]) / len(times[1:]):.2f} seconds (reused drivers)")

    return avg_time
def main():
    """Run every performance test in sequence and print a summary.

    The broad except at this top-level boundary keeps a network or
    dependency failure from dumping a raw traceback at the user.
    """
    print("=== Optimized Scraper Performance Test ===\n")
    try:
        single_time = test_single_scrape()
        concurrent_time = test_concurrent_scrapes()
        avg_time = test_repeated_scrapes()

        print("\n=== Performance Summary ===")
        print(f"Single scrape: {single_time:.2f} seconds")
        print(f"Concurrent scrapes: {concurrent_time:.2f} seconds")
        print(f"Average repeated scrape: {avg_time:.2f} seconds")

        print("\n=== Optimization Benefits ===")
        for benefit in (
            "βœ“ Driver pooling reduces initialization overhead",
            "βœ“ Smart waiting replaces fixed delays",
            "βœ“ Bulk JavaScript operations for faster element extraction",
            "βœ“ Performance-optimized Chrome options",
            "βœ“ Proper timeout handling prevents hanging",
            "βœ“ Thread-safe concurrent processing",
        ):
            print(benefit)
    except Exception as e:
        print(f"Test failed with error: {e}")
        print("Make sure you have internet connection and required dependencies installed.")


if __name__ == "__main__":
    main()