#!/usr/bin/env python3
"""
Quick test to verify the optimized scraper works
"""

import time
from clickloom_scrape import scraper
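# Assumed interface (based on how it is used below): scraper(url, timeout=...)
# returns a dict with 'page_text', 'script_sources', and 'link_sources' keys,
# or an 'error' key when the scrape fails.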

def test_basic_functionality():
    """Test basic scraper functionality"""
    print("πŸ§ͺ Testing basic scraper functionality...")
    
    # Test with a simple URL
    test_url = "https://httpbin.org/html"
    
    start_time = time.time()
    try:
        result = scraper(test_url, timeout=10)
        end_time = time.time()
        
        print(f"βœ… Scraper completed in {end_time - start_time:.2f} seconds")
        
        if 'error' in result:
            print(f"❌ Error occurred: {result['error']}")
            return False
        
        print(f"πŸ“„ Page text length: {len(result.get('page_text', ''))} characters")
        print(f"πŸ“œ Script sources found: {len(result.get('script_sources', []))}")
        print(f"πŸ”— Link sources found: {len(result.get('link_sources', []))}")
        
        # Check if we got some content
        if len(result.get('page_text', '')) > 0:
            print("βœ… Successfully extracted page content")
            return True
        else:
            print("⚠️ No page content extracted")
            return False
            
    except Exception as e:
        print(f"❌ Test failed with exception: {e}")
        return False

def test_multiple_requests():
    """Test multiple requests to show driver pooling benefits"""
    print("\nπŸ”„ Testing multiple requests (driver pooling)...")
    
    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml"
    ]
    
    times = []
    
    for i, url in enumerate(test_urls, 1):
        print(f"  Request {i}/{len(test_urls)}: {url}")
        
        start_time = time.time()
        try:
            result = scraper(url, timeout=10)
            end_time = time.time()
            
            request_time = end_time - start_time
            times.append(request_time)
            
            if 'error' in result:
                print(f"    ❌ Error: {result['error']}")
            else:
                print(f"    βœ… Completed in {request_time:.2f}s - {len(result.get('page_text', ''))} chars")
                
        except Exception as e:
            print(f"    ❌ Exception: {e}")
            times.append(float('inf'))
    
    valid_times = [t for t in times if t != float('inf')]
    if valid_times:
        avg_time = sum(valid_times) / len(valid_times)
        print(f"\nπŸ“Š Average request time: {avg_time:.2f} seconds")
        
        if len(valid_times) > 1:
            print(f"πŸ“ˆ First request: {valid_times[0]:.2f}s (includes driver creation)")
            print(f"πŸ“ˆ Subsequent avg: {sum(valid_times[1:]) / len(valid_times[1:]):.2f}s (reused drivers)")
        
        return True
    else:
        print("❌ All requests failed")
        return False

if __name__ == "__main__":
    print("πŸš€ Quick Test - Optimized Scraper\n")
    
    # Test basic functionality
    basic_test_passed = test_basic_functionality()
    
    # Test multiple requests
    multiple_test_passed = test_multiple_requests()
    
    print("\n" + "="*50)
    print("πŸ“‹ TEST SUMMARY")
    print("="*50)
    
    if basic_test_passed:
        print("βœ… Basic functionality: PASSED")
    else:
        print("❌ Basic functionality: FAILED")
    
    if multiple_test_passed:
        print("βœ… Multiple requests: PASSED")
    else:
        print("❌ Multiple requests: FAILED")
    
    if basic_test_passed and multiple_test_passed:
        print("\nπŸŽ‰ All tests passed! The optimized scraper is working correctly.")
        print("\nπŸš€ Key optimizations active:")
        print("  β€’ Driver pooling for faster subsequent requests")
        print("  β€’ Smart waiting instead of fixed delays")
        print("  β€’ Performance-optimized Chrome options")
        print("  β€’ Bulk JavaScript operations")
        print("  β€’ Proper timeout handling")
    else:
        print("\n⚠️ Some tests failed. Check the error messages above.")
        print("Make sure you have:")
        print("  β€’ Internet connection")
        print("  β€’ Chrome/Chromium browser installed")
        print("  β€’ All required dependencies installed")