# selenium-scraper / quick_test.py
# Source: apexherbert200's repository — "First commit" (f2c46e7)
#!/usr/bin/env python3
"""
Quick test to verify the optimized scraper works
"""
import time
from clickloom_scrape import scraper
def test_basic_functionality():
    """Scrape one simple page and report whether content came back.

    Returns True when the scraper produced non-empty page text without
    an error, False otherwise. All progress is printed to stdout.
    """
    print("🧪 Testing basic scraper functionality...")

    # A small, stable page is enough to prove the pipeline works.
    url = "https://httpbin.org/html"

    started = time.time()
    try:
        data = scraper(url, timeout=10)
        elapsed = time.time() - started
        print(f"✅ Scraper completed in {elapsed:.2f} seconds")

        # The scraper signals failure via an 'error' key rather than raising.
        if 'error' in data:
            print(f"❌ Error occurred: {data['error']}")
            return False

        page_text = data.get('page_text', '')
        print(f"📄 Page text length: {len(page_text)} characters")
        print(f"📜 Script sources found: {len(data.get('script_sources', []))}")
        print(f"🔗 Link sources found: {len(data.get('link_sources', []))}")

        if page_text:
            print("✅ Successfully extracted page content")
            return True
        print("⚠️ No page content extracted")
        return False
    except Exception as e:
        # Catch-all boundary: this is a diagnostic script, so report and move on.
        print(f"❌ Test failed with exception: {e}")
        return False
def test_multiple_requests():
    """Scrape several pages back-to-back to demonstrate driver pooling.

    Returns True when at least one request succeeded, False when all
    failed. Per-request and aggregate timings are printed to stdout.
    """
    print("\n🔄 Testing multiple requests (driver pooling)...")

    urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
    ]

    # float('inf') marks a request that raised, so it is excluded from averages.
    durations = []
    total = len(urls)
    for idx, target in enumerate(urls, 1):
        print(f" Request {idx}/{total}: {target}")
        begin = time.time()
        try:
            payload = scraper(target, timeout=10)
        except Exception as exc:
            print(f" ❌ Exception: {exc}")
            durations.append(float('inf'))
            continue
        elapsed = time.time() - begin
        durations.append(elapsed)
        if 'error' in payload:
            print(f" ❌ Error: {payload['error']}")
        else:
            print(f" ✅ Completed in {elapsed:.2f}s - {len(payload.get('page_text', ''))} chars")

    ok = [t for t in durations if t != float('inf')]
    if not ok:
        print("❌ All requests failed")
        return False

    print(f"\n📊 Average request time: {sum(ok) / len(ok):.2f} seconds")
    if len(ok) > 1:
        # The first request pays the driver start-up cost; later ones reuse it.
        rest = ok[1:]
        print(f"📈 First request: {ok[0]:.2f}s (includes driver creation)")
        print(f"📈 Subsequent avg: {sum(rest) / len(rest):.2f}s (reused drivers)")
    return True
if __name__ == "__main__":
print("πŸš€ Quick Test - Optimized Scraper\n")
# Test basic functionality
basic_test_passed = test_basic_functionality()
# Test multiple requests
multiple_test_passed = test_multiple_requests()
print("\n" + "="*50)
print("πŸ“‹ TEST SUMMARY")
print("="*50)
if basic_test_passed:
print("βœ… Basic functionality: PASSED")
else:
print("❌ Basic functionality: FAILED")
if multiple_test_passed:
print("βœ… Multiple requests: PASSED")
else:
print("❌ Multiple requests: FAILED")
if basic_test_passed and multiple_test_passed:
print("\nπŸŽ‰ All tests passed! The optimized scraper is working correctly.")
print("\nπŸš€ Key optimizations active:")
print(" β€’ Driver pooling for faster subsequent requests")
print(" β€’ Smart waiting instead of fixed delays")
print(" β€’ Performance-optimized Chrome options")
print(" β€’ Bulk JavaScript operations")
print(" β€’ Proper timeout handling")
else:
print("\n⚠️ Some tests failed. Check the error messages above.")
print("Make sure you have:")
print(" β€’ Internet connection")
print(" β€’ Chrome/Chromium browser installed")
print(" β€’ All required dependencies installed")