#!/usr/bin/env python3
"""
Quick test to verify the optimized scraper works
"""

import time

from clickloom_scrape import scraper


def test_basic_functionality():
    """Scrape one known URL and check that page content was extracted.

    Returns:
        bool: True when the scraper returned a result with non-empty
        ``page_text``; False when the result carries an ``error`` key,
        the text is empty, or any exception is raised.
    """
    print("๐Ÿงช Testing basic scraper functionality...")

    # Small, stable endpoint that serves a fixed HTML document.
    test_url = "https://httpbin.org/html"

    # perf_counter() is monotonic and high-resolution, so the elapsed
    # measurement is immune to system clock adjustments (time.time() is not).
    start_time = time.perf_counter()
    try:
        result = scraper(test_url, timeout=10)
        elapsed = time.perf_counter() - start_time
        print(f"โœ… Scraper completed in {elapsed:.2f} seconds")

        if 'error' in result:
            print(f"โŒ Error occurred: {result['error']}")
            return False

        page_text = result.get('page_text', '')
        print(f"๐Ÿ“„ Page text length: {len(page_text)} characters")
        print(f"๐Ÿ“œ Script sources found: {len(result.get('script_sources', []))}")
        print(f"๐Ÿ”— Link sources found: {len(result.get('link_sources', []))}")

        # Any non-empty text counts as a successful extraction.
        if page_text:
            print("โœ… Successfully extracted page content")
            return True
        print("โš ๏ธ No page content extracted")
        return False
    except Exception as e:
        # Broad catch is deliberate: this is a smoke-test harness, so every
        # failure mode should be reported rather than abort the whole run.
        print(f"โŒ Test failed with exception: {e}")
        return False


def test_multiple_requests():
    """Scrape several URLs in sequence to exercise driver pooling.

    Prints per-request timings plus an average; when more than one request
    succeeds it also contrasts the first request (which pays the driver
    creation cost) against the average of the subsequent ones.

    Returns:
        bool: True when at least one request completed, False when all failed.
    """
    print("\n๐Ÿ”„ Testing multiple requests (driver pooling)...")

    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
    ]

    # Elapsed time of each request that completed. Failed requests are
    # simply not recorded (the original appended float('inf') and then
    # filtered it back out — same resulting list, less indirection).
    valid_times = []
    for i, url in enumerate(test_urls, 1):
        print(f" Request {i}/{len(test_urls)}: {url}")
        start_time = time.perf_counter()
        try:
            result = scraper(url, timeout=10)
            request_time = time.perf_counter() - start_time
            valid_times.append(request_time)

            if 'error' in result:
                print(f" โŒ Error: {result['error']}")
            else:
                print(f" โœ… Completed in {request_time:.2f}s - {len(result.get('page_text', ''))} chars")
        except Exception as e:
            print(f" โŒ Exception: {e}")

    if not valid_times:
        print("โŒ All requests failed")
        return False

    avg_time = sum(valid_times) / len(valid_times)
    print(f"\n๐Ÿ“Š Average request time: {avg_time:.2f} seconds")

    if len(valid_times) > 1:
        print(f"๐Ÿ“ˆ First request: {valid_times[0]:.2f}s (includes driver creation)")
        subsequent = valid_times[1:]
        print(f"๐Ÿ“ˆ Subsequent avg: {sum(subsequent) / len(subsequent):.2f}s (reused drivers)")
    return True


def main():
    """Run both smoke tests and print a pass/fail summary."""
    print("๐Ÿš€ Quick Test - Optimized Scraper\n")

    # Run both tests regardless of individual outcomes so the summary
    # always covers both.
    basic_test_passed = test_basic_functionality()
    multiple_test_passed = test_multiple_requests()

    print("\n" + "=" * 50)
    print("๐Ÿ“‹ TEST SUMMARY")
    print("=" * 50)

    if basic_test_passed:
        print("โœ… Basic functionality: PASSED")
    else:
        print("โŒ Basic functionality: FAILED")

    if multiple_test_passed:
        print("โœ… Multiple requests: PASSED")
    else:
        print("โŒ Multiple requests: FAILED")

    if basic_test_passed and multiple_test_passed:
        print("\n๐ŸŽ‰ All tests passed! The optimized scraper is working correctly.")
        print("\n๐Ÿš€ Key optimizations active:")
        print(" โ€ข Driver pooling for faster subsequent requests")
        print(" โ€ข Smart waiting instead of fixed delays")
        print(" โ€ข Performance-optimized Chrome options")
        print(" โ€ข Bulk JavaScript operations")
        print(" โ€ข Proper timeout handling")
    else:
        print("\nโš ๏ธ Some tests failed. Check the error messages above.")
        print("Make sure you have:")
        print(" โ€ข Internet connection")
        print(" โ€ข Chrome/Chromium browser installed")
        print(" โ€ข All required dependencies installed")


if __name__ == "__main__":
    main()