# selenium-scraper / test_performance.py
# First commit (f2c46e7) by apexherbert200
#!/usr/bin/env python3
"""
Performance test script for the optimized scraper
"""
import time
import asyncio
from clickloom_scrape import scraper
from concurrent.futures import ThreadPoolExecutor
def test_single_scrape():
    """Time one scraper() call against httpbin and print basic result stats.

    Returns:
        float: elapsed wall-clock seconds for the single scrape.
    """
    print("Testing single scrape performance...")
    test_url = "https://httpbin.org/html"

    # perf_counter is monotonic and high-resolution, so it cannot go
    # backwards mid-measurement the way time.time() can (e.g. NTP sync).
    start_time = time.perf_counter()
    result = scraper(test_url)
    elapsed = time.perf_counter() - start_time

    print(f"Single scrape took: {elapsed:.2f} seconds")
    # scraper() is expected to return a dict; .get() keeps the report
    # working even when a key is absent (e.g. on a failed scrape).
    print(f"Page text length: {len(result.get('page_text', ''))}")
    print(f"Script sources found: {len(result.get('script_sources', []))}")
    print(f"Link sources found: {len(result.get('link_sources', []))}")

    if 'error' in result:
        print(f"Error occurred: {result['error']}")

    return elapsed
def test_concurrent_scrapes():
    """Scrape several httpbin URLs concurrently and report the timings.

    Returns:
        float: total elapsed wall-clock seconds for all URLs.
    """
    print("\nTesting concurrent scrape performance...")
    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
        "https://httpbin.org/robots.txt",
        "https://httpbin.org/status/200",
    ]

    # perf_counter is the monotonic interval clock; time.time() can jump.
    start_time = time.perf_counter()

    # executor.map preserves input order, so results[i] matches
    # test_urls[i] — same as the original submit/collect loop, less code.
    with ThreadPoolExecutor(max_workers=3) as executor:
        results = list(executor.map(scraper, test_urls))

    elapsed = time.perf_counter() - start_time

    print(f"Concurrent scrapes ({len(test_urls)} URLs) took: {elapsed:.2f} seconds")
    print(f"Average time per URL: {elapsed / len(test_urls):.2f} seconds")

    for i, result in enumerate(results):
        if 'error' in result:
            print(f"URL {i+1} error: {result['error']}")
        else:
            print(f"URL {i+1}: {len(result.get('page_text', ''))} chars")

    return elapsed
def test_repeated_scrapes(num_scrapes=5):
    """Scrape the same URL repeatedly to show driver-pooling benefits.

    The first scrape is expected to be slower (driver creation); later
    scrapes should be faster if the scraper reuses pooled drivers.

    Args:
        num_scrapes: how many sequential scrapes to run (default 5,
            matching the original hard-coded count).

    Returns:
        float: average seconds per scrape across all runs.
    """
    print("\nTesting repeated scrapes (driver pooling benefits)...")
    test_url = "https://httpbin.org/html"
    times = []

    for i in range(num_scrapes):
        # perf_counter: monotonic, high-resolution interval clock.
        start_time = time.perf_counter()
        result = scraper(test_url)
        scrape_time = time.perf_counter() - start_time
        times.append(scrape_time)
        print(f"Scrape {i+1}: {scrape_time:.2f} seconds")

        if 'error' in result:
            print(f"  Error: {result['error']}")

    avg_time = sum(times) / len(times)
    print(f"\nAverage time per scrape: {avg_time:.2f} seconds")
    print(f"First scrape: {times[0]:.2f} seconds (includes driver creation)")
    # Guard: with num_scrapes == 1 there are no "subsequent" scrapes,
    # and the original times[1:] average would divide by zero.
    if len(times) > 1:
        print(f"Subsequent scrapes avg: {sum(times[1:]) / len(times[1:]):.2f} seconds (reused drivers)")

    return avg_time
def main():
    """Run every performance test in sequence and print a summary.

    The broad except at this top-level boundary keeps a network or
    dependency failure from dumping a raw traceback at the user.
    """
    print("=== Optimized Scraper Performance Test ===\n")
    try:
        single_time = test_single_scrape()
        concurrent_time = test_concurrent_scrapes()
        avg_time = test_repeated_scrapes()

        print("\n=== Performance Summary ===")
        print(f"Single scrape: {single_time:.2f} seconds")
        print(f"Concurrent scrapes: {concurrent_time:.2f} seconds")
        print(f"Average repeated scrape: {avg_time:.2f} seconds")

        print("\n=== Optimization Benefits ===")
        for benefit in (
            "βœ“ Driver pooling reduces initialization overhead",
            "βœ“ Smart waiting replaces fixed delays",
            "βœ“ Bulk JavaScript operations for faster element extraction",
            "βœ“ Performance-optimized Chrome options",
            "βœ“ Proper timeout handling prevents hanging",
            "βœ“ Thread-safe concurrent processing",
        ):
            print(benefit)
    except Exception as e:
        print(f"Test failed with error: {e}")
        print("Make sure you have internet connection and required dependencies installed.")


if __name__ == "__main__":
    main()