#!/usr/bin/env python3
"""
Performance test script for the optimized scraper
"""
import time
import asyncio
from clickloom_scrape import scraper
from concurrent.futures import ThreadPoolExecutor


def _timed_scrape(url):
    """Run one scrape of *url* and return (result_dict, elapsed_seconds).

    Uses time.perf_counter() rather than time.time(): perf_counter is
    monotonic and high-resolution, so short benchmark intervals are not
    distorted by wall-clock adjustments (NTP, DST, manual changes).
    """
    start = time.perf_counter()
    result = scraper(url)
    return result, time.perf_counter() - start


def test_single_scrape():
    """Time a single scrape and print basic stats about the result.

    Returns the elapsed time in seconds.
    """
    print("Testing single scrape performance...")
    test_url = "https://httpbin.org/html"

    result, elapsed = _timed_scrape(test_url)

    print(f"Single scrape took: {elapsed:.2f} seconds")
    # .get(...) with defaults: the scraper may omit keys on failure.
    print(f"Page text length: {len(result.get('page_text', ''))}")
    print(f"Script sources found: {len(result.get('script_sources', []))}")
    print(f"Link sources found: {len(result.get('link_sources', []))}")

    if 'error' in result:
        print(f"Error occurred: {result['error']}")

    return elapsed


def test_concurrent_scrapes():
    """Time several scrapes fanned out over a small thread pool.

    Returns total elapsed wall time in seconds for the whole batch.
    """
    print("\nTesting concurrent scrape performance...")
    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
        "https://httpbin.org/robots.txt",
        "https://httpbin.org/status/200"
    ]

    start = time.perf_counter()
    with ThreadPoolExecutor(max_workers=3) as executor:
        # executor.map preserves input order, so results line up with
        # test_urls exactly as the original submit/result pattern did.
        results = list(executor.map(scraper, test_urls))
    elapsed = time.perf_counter() - start

    print(f"Concurrent scrapes ({len(test_urls)} URLs) took: {elapsed:.2f} seconds")
    print(f"Average time per URL: {elapsed / len(test_urls):.2f} seconds")

    for i, result in enumerate(results):
        if 'error' in result:
            print(f"URL {i+1} error: {result['error']}")
        else:
            print(f"URL {i+1}: {len(result.get('page_text', ''))} chars")

    return elapsed


def test_repeated_scrapes():
    """Time repeated scrapes of one URL to show driver-pooling benefits.

    The first scrape includes driver creation; later ones should reuse
    pooled drivers and run faster. Returns the average time per scrape.
    """
    print("\nTesting repeated scrapes (driver pooling benefits)...")
    test_url = "https://httpbin.org/html"
    num_scrapes = 5

    times = []
    for i in range(num_scrapes):
        result, scrape_time = _timed_scrape(test_url)
        times.append(scrape_time)
        print(f"Scrape {i+1}: {scrape_time:.2f} seconds")
        if 'error' in result:
            print(f" Error: {result['error']}")

    avg_time = sum(times) / len(times)
    print(f"\nAverage time per scrape: {avg_time:.2f} seconds")
    print(f"First scrape: {times[0]:.2f} seconds (includes driver creation)")
    # Guard: with num_scrapes == 1 the original divided by zero here.
    if len(times) > 1:
        print(f"Subsequent scrapes avg: {sum(times[1:]) / len(times[1:]):.2f} seconds (reused drivers)")

    return avg_time


if __name__ == "__main__":
    print("=== Optimized Scraper Performance Test ===\n")

    try:
        # Run the three scenarios in sequence; each returns a timing
        # figure used in the summary below.
        single_time = test_single_scrape()
        concurrent_time = test_concurrent_scrapes()
        avg_time = test_repeated_scrapes()

        print("\n=== Performance Summary ===")
        print(f"Single scrape: {single_time:.2f} seconds")
        print(f"Concurrent scrapes: {concurrent_time:.2f} seconds")
        print(f"Average repeated scrape: {avg_time:.2f} seconds")

        print("\n=== Optimization Benefits ===")
        print("✓ Driver pooling reduces initialization overhead")
        print("✓ Smart waiting replaces fixed delays")
        print("✓ Bulk JavaScript operations for faster element extraction")
        print("✓ Performance-optimized Chrome options")
        print("✓ Proper timeout handling prevents hanging")
        print("✓ Thread-safe concurrent processing")

    except Exception as e:
        # Broad catch is deliberate at this top-level script boundary:
        # any failure (network, missing driver, import issue in the
        # scraper) should end with a readable hint, not a traceback.
        print(f"Test failed with error: {e}")
        print("Make sure you have internet connection and required dependencies installed.")