File size: 4,078 Bytes
f2c46e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
"""
Performance test script for the optimized scraper
"""

import time
import asyncio
from clickloom_scrape import scraper
from concurrent.futures import ThreadPoolExecutor

def test_single_scrape():
    """Time one scrape of a single URL and print basic result statistics.

    Returns:
        Elapsed wall-clock seconds for the scrape.
    """
    print("Testing single scrape performance...")

    url = "https://httpbin.org/html"

    started = time.time()
    result = scraper(url)
    elapsed = time.time() - started

    print(f"Single scrape took: {elapsed:.2f} seconds")
    print(f"Page text length: {len(result.get('page_text', ''))}")
    print(f"Script sources found: {len(result.get('script_sources', []))}")
    print(f"Link sources found: {len(result.get('link_sources', []))}")

    # The scraper reports failures via an 'error' key rather than raising.
    if 'error' in result:
        print(f"Error occurred: {result['error']}")

    return elapsed

def test_concurrent_scrapes():
    """Scrape several URLs concurrently with a small thread pool and report timings.

    Returns:
        Total elapsed wall-clock seconds for all URLs.
    """
    print("\nTesting concurrent scrape performance...")

    urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
        "https://httpbin.org/robots.txt",
        "https://httpbin.org/status/200",
    ]

    started = time.time()
    # executor.map preserves input order, matching the original submit/result loop.
    with ThreadPoolExecutor(max_workers=3) as pool:
        results = list(pool.map(scraper, urls))
    elapsed = time.time() - started

    print(f"Concurrent scrapes ({len(urls)} URLs) took: {elapsed:.2f} seconds")
    print(f"Average time per URL: {elapsed / len(urls):.2f} seconds")

    for idx, result in enumerate(results, start=1):
        if 'error' in result:
            print(f"URL {idx} error: {result['error']}")
        else:
            print(f"URL {idx}: {len(result.get('page_text', ''))} chars")

    return elapsed

def test_repeated_scrapes(num_scrapes=5):
    """Test repeated scrapes to show driver pooling benefits.

    Scrapes the same URL repeatedly; the first iteration typically pays the
    driver-creation cost while subsequent ones should reuse pooled drivers.

    Args:
        num_scrapes: Number of times to scrape the URL (default 5, matching
            the original hard-coded value).

    Returns:
        Average elapsed seconds per scrape.

    Raises:
        ValueError: If num_scrapes is less than 1.
    """
    if num_scrapes < 1:
        raise ValueError("num_scrapes must be at least 1")

    print("\nTesting repeated scrapes (driver pooling benefits)...")

    test_url = "https://httpbin.org/html"
    times = []

    for i in range(num_scrapes):
        start_time = time.time()
        result = scraper(test_url)
        scrape_time = time.time() - start_time
        times.append(scrape_time)

        print(f"Scrape {i+1}: {scrape_time:.2f} seconds")

        if 'error' in result:
            print(f"  Error: {result['error']}")

    avg_time = sum(times) / len(times)
    print(f"\nAverage time per scrape: {avg_time:.2f} seconds")
    print(f"First scrape: {times[0]:.2f} seconds (includes driver creation)")
    # Guard: with a single scrape there are no subsequent timings to average
    # (the original divided by len(times[1:]) unconditionally — ZeroDivisionError
    # for num_scrapes == 1).
    if len(times) > 1:
        print(f"Subsequent scrapes avg: {sum(times[1:]) / len(times[1:]):.2f} seconds (reused drivers)")

    return avg_time

if __name__ == "__main__":
    print("=== Optimized Scraper Performance Test ===\n")
    
    try:
        # Test single scrape
        single_time = test_single_scrape()
        
        # Test concurrent scrapes
        concurrent_time = test_concurrent_scrapes()
        
        # Test repeated scrapes
        avg_time = test_repeated_scrapes()
        
        print("\n=== Performance Summary ===")
        print(f"Single scrape: {single_time:.2f} seconds")
        print(f"Concurrent scrapes: {concurrent_time:.2f} seconds")
        print(f"Average repeated scrape: {avg_time:.2f} seconds")
        
        print("\n=== Optimization Benefits ===")
        print("βœ“ Driver pooling reduces initialization overhead")
        print("βœ“ Smart waiting replaces fixed delays")
        print("βœ“ Bulk JavaScript operations for faster element extraction")
        print("βœ“ Performance-optimized Chrome options")
        print("βœ“ Proper timeout handling prevents hanging")
        print("βœ“ Thread-safe concurrent processing")
        
    except Exception as e:
        print(f"Test failed with error: {e}")
        print("Make sure you have internet connection and required dependencies installed.")