Spaces:
No application file
No application file
| #!/usr/bin/env python3 | |
| """ | |
| Quick test to verify the optimized scraper works | |
| """ | |
| import time | |
| from clickloom_scrape import scraper | |
def test_basic_functionality():
    """Run the scraper once against a known-good URL and report the result.

    Returns True when the scraper completes without an error entry and
    extracts a non-empty page text, False otherwise.
    """
    print("π§ͺ Testing basic scraper functionality...")

    # A lightweight page that reliably serves some HTML.
    test_url = "https://httpbin.org/html"

    started = time.time()
    try:
        result = scraper(test_url, timeout=10)
        elapsed = time.time() - started
        print(f"β Scraper completed in {elapsed:.2f} seconds")

        # The scraper signals failure via an 'error' key in its result dict.
        if 'error' in result:
            print(f"β Error occurred: {result['error']}")
            return False

        page_text = result.get('page_text', '')
        print(f"π Page text length: {len(page_text)} characters")
        print(f"π Script sources found: {len(result.get('script_sources', []))}")
        print(f"π Link sources found: {len(result.get('link_sources', []))}")

        # Success requires at least some extracted text.
        if page_text:
            print("β Successfully extracted page content")
            return True
        print("β οΈ No page content extracted")
        return False
    except Exception as e:
        print(f"β Test failed with exception: {e}")
        return False
def test_multiple_requests():
    """Issue several sequential scrape requests to demonstrate driver pooling.

    Prints per-request timing plus an average, and separately reports the
    first request (which pays the driver-creation cost) versus the rest.

    Returns:
        True if at least one request succeeded, False if all failed.
    """
    print("\nπ Testing multiple requests (driver pooling)...")

    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
    ]

    # Durations of successful requests only; failures are simply skipped,
    # which replaces the original float('inf') sentinel + filter dance.
    times = []
    for i, url in enumerate(test_urls, 1):
        print(f" Request {i}/{len(test_urls)}: {url}")
        # perf_counter() is a monotonic clock, so the measurement cannot be
        # skewed by wall-clock adjustments (NTP, DST) the way time.time() can.
        start_time = time.perf_counter()
        try:
            result = scraper(url, timeout=10)
            request_time = time.perf_counter() - start_time
            times.append(request_time)
            if 'error' in result:
                print(f" β Error: {result['error']}")
            else:
                print(f" β Completed in {request_time:.2f}s - {len(result.get('page_text', ''))} chars")
        except Exception as e:
            print(f" β Exception: {e}")

    if times:
        avg_time = sum(times) / len(times)
        print(f"\nπ Average request time: {avg_time:.2f} seconds")
        if len(times) > 1:
            # The first timed request includes one-time driver startup cost.
            print(f"π First request: {times[0]:.2f}s (includes driver creation)")
            print(f"π Subsequent avg: {sum(times[1:]) / len(times[1:]):.2f}s (reused drivers)")
        return True
    print("β All requests failed")
    return False
if __name__ == "__main__":
    # Entry point: run both test scenarios, then print a pass/fail summary.
    print("π Quick Test - Optimized Scraper\n")

    basic_ok = test_basic_functionality()
    multi_ok = test_multiple_requests()

    separator = "=" * 50
    print("\n" + separator)
    print("π TEST SUMMARY")
    print(separator)

    print("β Basic functionality: PASSED" if basic_ok else "β Basic functionality: FAILED")
    print("β Multiple requests: PASSED" if multi_ok else "β Multiple requests: FAILED")

    if basic_ok and multi_ok:
        print("\nπ All tests passed! The optimized scraper is working correctly.")
        print("\nπ Key optimizations active:")
        for bullet in (
            " β’ Driver pooling for faster subsequent requests",
            " β’ Smart waiting instead of fixed delays",
            " β’ Performance-optimized Chrome options",
            " β’ Bulk JavaScript operations",
            " β’ Proper timeout handling",
        ):
            print(bullet)
    else:
        print("\nβ οΈ Some tests failed. Check the error messages above.")
        print("Make sure you have:")
        print(" β’ Internet connection")
        print(" β’ Chrome/Chromium browser installed")
        print(" β’ All required dependencies installed")