Spaces:
No application file
No application file
| #!/usr/bin/env python3 | |
| """ | |
| Performance test script for the optimized scraper | |
| """ | |
| import time | |
| import asyncio | |
| from clickloom_scrape import scraper | |
| from concurrent.futures import ThreadPoolExecutor | |
def test_single_scrape():
    """Measure and report the wall-clock cost of one scraper() call.

    Prints the elapsed time and basic stats about the scraped payload
    (page-text length, script/link source counts, any reported error).

    Returns:
        float: elapsed seconds for the single scrape.
    """
    print("Testing single scrape performance...")
    test_url = "https://httpbin.org/html"
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # backwards/forwards (NTP adjustments) and is the wrong clock for
    # measuring elapsed intervals in a performance test.
    start_time = time.perf_counter()
    result = scraper(test_url)
    elapsed = time.perf_counter() - start_time
    print(f"Single scrape took: {elapsed:.2f} seconds")
    print(f"Page text length: {len(result.get('page_text', ''))}")
    print(f"Script sources found: {len(result.get('script_sources', []))}")
    print(f"Link sources found: {len(result.get('link_sources', []))}")
    # scraper() reports failures via an 'error' key rather than raising.
    if 'error' in result:
        print(f"Error occurred: {result['error']}")
    return elapsed
def test_concurrent_scrapes():
    """Scrape several URLs in parallel and report total and per-URL timing.

    Returns:
        float: total elapsed seconds for the whole batch.
    """
    print("\nTesting concurrent scrape performance...")
    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
        "https://httpbin.org/robots.txt",
        "https://httpbin.org/status/200"
    ]
    started = time.time()
    # Executor.map preserves input order, same as collecting submit() futures
    # in submission order.
    with ThreadPoolExecutor(max_workers=3) as pool:
        results = list(pool.map(scraper, test_urls))
    total = time.time() - started
    print(f"Concurrent scrapes ({len(test_urls)} URLs) took: {total:.2f} seconds")
    print(f"Average time per URL: {total / len(test_urls):.2f} seconds")
    for idx, outcome in enumerate(results, start=1):
        if 'error' in outcome:
            print(f"URL {idx} error: {outcome['error']}")
        else:
            print(f"URL {idx}: {len(outcome.get('page_text', ''))} chars")
    return total
def test_repeated_scrapes(num_scrapes=5):
    """Scrape the same URL repeatedly to show driver-pooling benefits.

    The first scrape typically pays the driver-creation cost; subsequent
    scrapes reuse pooled drivers, so their average is reported separately.

    Args:
        num_scrapes: how many times to scrape the URL (default 5, matching
            the original hard-coded behavior).

    Returns:
        float: average elapsed seconds per scrape.
    """
    print("\nTesting repeated scrapes (driver pooling benefits)...")
    test_url = "https://httpbin.org/html"
    times = []
    for i in range(num_scrapes):
        # Monotonic clock for interval measurement (time.time() can jump).
        start_time = time.perf_counter()
        result = scraper(test_url)
        scrape_time = time.perf_counter() - start_time
        times.append(scrape_time)
        print(f"Scrape {i+1}: {scrape_time:.2f} seconds")
        if 'error' in result:
            print(f"  Error: {result['error']}")
    avg_time = sum(times) / len(times)
    print(f"\nAverage time per scrape: {avg_time:.2f} seconds")
    print(f"First scrape: {times[0]:.2f} seconds (includes driver creation)")
    # Guard: with a single scrape there are no "subsequent" samples and the
    # original times[1:] average would divide by zero.
    if len(times) > 1:
        print(f"Subsequent scrapes avg: {sum(times[1:]) / len(times[1:]):.2f} seconds (reused drivers)")
    return avg_time
if __name__ == "__main__":
    print("=== Optimized Scraper Performance Test ===\n")
    try:
        # Run the three scenarios in sequence and keep their timings
        # for the summary below.
        single_time = test_single_scrape()
        concurrent_time = test_concurrent_scrapes()
        avg_time = test_repeated_scrapes()
        print("\n=== Performance Summary ===")
        print(f"Single scrape: {single_time:.2f} seconds")
        print(f"Concurrent scrapes: {concurrent_time:.2f} seconds")
        print(f"Average repeated scrape: {avg_time:.2f} seconds")
        print("\n=== Optimization Benefits ===")
        for benefit in (
            "β Driver pooling reduces initialization overhead",
            "β Smart waiting replaces fixed delays",
            "β Bulk JavaScript operations for faster element extraction",
            "β Performance-optimized Chrome options",
            "β Proper timeout handling prevents hanging",
            "β Thread-safe concurrent processing",
        ):
            print(benefit)
    except Exception as e:
        # Top-level boundary: report the failure instead of a raw traceback,
        # since network/driver issues are the expected failure mode here.
        print(f"Test failed with error: {e}")
        print("Make sure you have internet connection and required dependencies installed.")