Spaces:
No application file
No application file
File size: 4,271 Bytes
f2c46e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
#!/usr/bin/env python3
"""
Quick test to verify the optimized scraper works
"""
import time
from clickloom_scrape import scraper
def test_basic_functionality():
    """Exercise the scraper once against a single known URL.

    Returns:
        bool: True when the scrape succeeds and yields non-empty page
        text; False on a reported error, empty content, or exception.
    """
    # NOTE(review): emoji markers below are reconstructed from mojibake
    # (UTF-8 misread as ISO-8859-7); some glyphs are best-effort guesses.
    print("🧪 Testing basic scraper functionality...")

    # httpbin serves a small, stable HTML page — a good smoke-test target.
    test_url = "https://httpbin.org/html"

    # perf_counter is monotonic, so the elapsed figure cannot be skewed
    # by a system-clock adjustment mid-test (time.time can).
    start_time = time.perf_counter()
    try:
        result = scraper(test_url, timeout=10)
        elapsed = time.perf_counter() - start_time
        print(f"✅ Scraper completed in {elapsed:.2f} seconds")

        # The scraper signals failure via an 'error' key rather than raising.
        if 'error' in result:
            print(f"❌ Error occurred: {result['error']}")
            return False

        print(f"📄 Page text length: {len(result.get('page_text', ''))} characters")
        print(f"📜 Script sources found: {len(result.get('script_sources', []))}")
        print(f"🔗 Link sources found: {len(result.get('link_sources', []))}")

        # Success requires at least some extracted text.
        if len(result.get('page_text', '')) > 0:
            print("✅ Successfully extracted page content")
            return True
        else:
            print("⚠️ No page content extracted")
            return False
    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic script, and any
        # failure mode should report as a failed test, not a crash.
        print(f"❌ Test failed with exception: {e}")
        return False
def test_multiple_requests():
    """Fire several sequential requests to demonstrate driver pooling.

    Times each request; the first typically includes driver creation
    while subsequent ones reuse a pooled driver.

    Returns:
        bool: True if at least one request completed, False if all failed.
    """
    # NOTE(review): emoji markers below are reconstructed from mojibake
    # (UTF-8 misread as ISO-8859-7); some glyphs are best-effort guesses.
    print("\n🚀 Testing multiple requests (driver pooling)...")

    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
    ]

    times = []
    for i, url in enumerate(test_urls, 1):
        print(f"  Request {i}/{len(test_urls)}: {url}")
        # Monotonic clock so per-request timings can't go negative.
        start_time = time.perf_counter()
        try:
            result = scraper(url, timeout=10)
            request_time = time.perf_counter() - start_time
            times.append(request_time)
            if 'error' in result:
                print(f"    ❌ Error: {result['error']}")
            else:
                print(f"    ✅ Completed in {request_time:.2f}s - {len(result.get('page_text', ''))} chars")
        except Exception as e:
            print(f"    ❌ Exception: {e}")
            # Sentinel so failed requests are excluded from the averages.
            times.append(float('inf'))

    valid_times = [t for t in times if t != float('inf')]
    if valid_times:
        avg_time = sum(valid_times) / len(valid_times)
        print(f"\n📊 Average request time: {avg_time:.2f} seconds")
        # With 2+ successes, contrast cold-start vs pooled-driver timings.
        if len(valid_times) > 1:
            print(f"📈 First request: {valid_times[0]:.2f}s (includes driver creation)")
            print(f"📈 Subsequent avg: {sum(valid_times[1:]) / len(valid_times[1:]):.2f}s (reused drivers)")
        return True
    else:
        print("❌ All requests failed")
        return False
def main():
    """Run both smoke tests and print a pass/fail summary."""
    # NOTE(review): emoji markers below are reconstructed from mojibake
    # (UTF-8 misread as ISO-8859-7); some glyphs are best-effort guesses.
    print("🚀 Quick Test - Optimized Scraper\n")

    # Test basic functionality, then multiple requests (driver pooling).
    basic_test_passed = test_basic_functionality()
    multiple_test_passed = test_multiple_requests()

    print("\n" + "=" * 50)
    print("📋 TEST SUMMARY")
    print("=" * 50)

    if basic_test_passed:
        print("✅ Basic functionality: PASSED")
    else:
        print("❌ Basic functionality: FAILED")

    if multiple_test_passed:
        print("✅ Multiple requests: PASSED")
    else:
        print("❌ Multiple requests: FAILED")

    if basic_test_passed and multiple_test_passed:
        print("\n🎉 All tests passed! The optimized scraper is working correctly.")
        print("\n💡 Key optimizations active:")
        print("  • Driver pooling for faster subsequent requests")
        print("  • Smart waiting instead of fixed delays")
        print("  • Performance-optimized Chrome options")
        print("  • Bulk JavaScript operations")
        print("  • Proper timeout handling")
    else:
        print("\n⚠️ Some tests failed. Check the error messages above.")
        print("Make sure you have:")
        print("  • Internet connection")
        print("  • Chrome/Chromium browser installed")
        print("  • All required dependencies installed")


if __name__ == "__main__":
    main()
|