File size: 8,190 Bytes
f2c46e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python3
"""
Performance comparison between legacy and optimized scrapers
"""

import time
from concurrent.futures import ThreadPoolExecutor
from clickloom_scrape import scraper as optimized_scraper
from legacy_scraper import legacy_scraper

def time_function(func, *args, **kwargs):
    """Execute *func* with the given arguments and measure its duration.

    Returns:
        tuple: ``(result, elapsed_seconds)`` where ``result`` is whatever
        *func* returned and ``elapsed_seconds`` is a float.

    Uses ``time.perf_counter()`` rather than ``time.time()``: perf_counter
    is monotonic and has the highest available resolution, so the measured
    interval cannot be skewed by system clock adjustments (NTP, DST).
    """
    start = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start
    return result, elapsed

def compare_single_scrape():
    """Time a single scrape of a test URL with each scraper and print a report.

    Returns:
        tuple: ``(legacy_time, optimized_time)`` in seconds; a scraper that
        raised is reported as ``float('inf')`` so callers can detect failure.
    """
    print("=== Single Scrape Comparison ===")

    test_url = "https://httpbin.org/html"

    # Legacy scraper: one cold start per request (no driver reuse).
    print("Testing legacy scraper...")
    try:
        # Scrape result itself is irrelevant here; only the timing matters.
        _, legacy_time = time_function(legacy_scraper, test_url)
        print(f"Legacy scraper: {legacy_time:.2f} seconds")
        legacy_success = True
    except Exception as e:
        print(f"Legacy scraper failed: {e}")
        legacy_time = float('inf')
        legacy_success = False

    # Optimized scraper: expected to benefit from driver pooling.
    print("Testing optimized scraper...")
    try:
        _, optimized_time = time_function(optimized_scraper, test_url)
        print(f"Optimized scraper: {optimized_time:.2f} seconds")
        optimized_success = True
    except Exception as e:
        print(f"Optimized scraper failed: {e}")
        optimized_time = float('inf')
        optimized_success = False

    # Only compare when both runs succeeded; also require positive timings
    # so the ratio/percentage divisions cannot raise ZeroDivisionError on
    # sub-resolution measurements.
    if legacy_success and optimized_success and legacy_time > 0 and optimized_time > 0:
        improvement = ((legacy_time - optimized_time) / legacy_time) * 100
        print(f"Performance improvement: {improvement:.1f}%")

        if optimized_time < legacy_time:
            print(f"Optimized scraper is {legacy_time / optimized_time:.1f}x faster")
        else:
            print(f"Legacy scraper is {optimized_time / legacy_time:.1f}x faster")

    return legacy_time, optimized_time

def compare_repeated_scrapes():
    """Run several back-to-back scrapes with each scraper to expose the
    benefit of driver pooling (the optimized scraper reuses drivers, the
    legacy one pays the startup cost every time).

    Returns:
        tuple: ``(legacy_total, optimized_total)`` — summed wall time in
        seconds over the successful scrapes of each scraper.
    """
    print("\n=== Repeated Scrapes Comparison ===")

    test_url = "https://httpbin.org/html"
    num_scrapes = 3

    # Legacy scraper: creates a new driver for every scrape.
    print(f"Testing legacy scraper ({num_scrapes} scrapes)...")
    legacy_times = []

    for i in range(num_scrapes):
        try:
            _, scrape_time = time_function(legacy_scraper, test_url)
            legacy_times.append(scrape_time)
            print(f"  Scrape {i+1}: {scrape_time:.2f} seconds")
        except Exception as e:
            print(f"  Scrape {i+1} failed: {e}")
            # Sentinel so failures are visible but excluded from totals.
            legacy_times.append(float('inf'))

    # BUG FIX: the original guarded the average on `legacy_times`, which is
    # never empty after the loop, then divided by the length of the
    # inf-filtered list — an all-failures run raised ZeroDivisionError.
    # Filter once and guard on the list actually used as the divisor.
    legacy_valid = [t for t in legacy_times if t != float('inf')]
    legacy_total = sum(legacy_valid)
    legacy_avg = legacy_total / len(legacy_valid) if legacy_valid else 0

    # Optimized scraper: reuses pooled drivers across scrapes.
    print(f"Testing optimized scraper ({num_scrapes} scrapes)...")
    optimized_times = []

    for i in range(num_scrapes):
        try:
            _, scrape_time = time_function(optimized_scraper, test_url)
            optimized_times.append(scrape_time)
            print(f"  Scrape {i+1}: {scrape_time:.2f} seconds")
        except Exception as e:
            print(f"  Scrape {i+1} failed: {e}")
            optimized_times.append(float('inf'))

    optimized_valid = [t for t in optimized_times if t != float('inf')]
    optimized_total = sum(optimized_valid)
    optimized_avg = optimized_total / len(optimized_valid) if optimized_valid else 0

    print(f"\nLegacy total time: {legacy_total:.2f} seconds (avg: {legacy_avg:.2f}s)")
    print(f"Optimized total time: {optimized_total:.2f} seconds (avg: {optimized_avg:.2f}s)")

    # Both totals positive implies at least one success on each side and
    # keeps the divisions below safe.
    if legacy_total > 0 and optimized_total > 0:
        improvement = ((legacy_total - optimized_total) / legacy_total) * 100
        print(f"Total time improvement: {improvement:.1f}%")
        print(f"Speedup factor: {legacy_total / optimized_total:.1f}x")

    return legacy_total, optimized_total

def compare_concurrent_scrapes():
    """Scrape several URLs in parallel with each scraper and compare the
    total wall time — exercises thread-safety and pooled-driver contention.

    Returns:
        tuple: ``(legacy_concurrent_time, optimized_concurrent_time)`` in
        seconds; ``float('inf')`` marks a failed run.
    """
    print("\n=== Concurrent Scrapes Comparison ===")

    test_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml"
    ]

    # Legacy scraper under concurrency.
    print(f"Testing legacy scraper ({len(test_urls)} concurrent scrapes)...")
    start_time = time.time()

    try:
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(legacy_scraper, url) for url in test_urls]
            # Consume each future so any worker exception propagates here;
            # the scrape results themselves are not needed.
            for future in futures:
                future.result()
        legacy_concurrent_time = time.time() - start_time
        print(f"Legacy concurrent time: {legacy_concurrent_time:.2f} seconds")
        legacy_concurrent_success = True
    except Exception as e:
        print(f"Legacy concurrent scraping failed: {e}")
        legacy_concurrent_time = float('inf')
        legacy_concurrent_success = False

    # Optimized scraper under concurrency.
    print(f"Testing optimized scraper ({len(test_urls)} concurrent scrapes)...")
    start_time = time.time()

    try:
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(optimized_scraper, url) for url in test_urls]
            for future in futures:
                future.result()
        optimized_concurrent_time = time.time() - start_time
        print(f"Optimized concurrent time: {optimized_concurrent_time:.2f} seconds")
        optimized_concurrent_success = True
    except Exception as e:
        print(f"Optimized concurrent scraping failed: {e}")
        optimized_concurrent_time = float('inf')
        optimized_concurrent_success = False

    if legacy_concurrent_success and optimized_concurrent_success:
        improvement = ((legacy_concurrent_time - optimized_concurrent_time) / legacy_concurrent_time) * 100
        print(f"Concurrent performance improvement: {improvement:.1f}%")
        print(f"Concurrent speedup factor: {legacy_concurrent_time / optimized_concurrent_time:.1f}x")

    return legacy_concurrent_time, optimized_concurrent_time

if __name__ == "__main__":
    print("🚀 Scraper Performance Comparison\n")
    
    try:
        # Single scrape comparison
        legacy_single, optimized_single = compare_single_scrape()
        
        # Repeated scrapes comparison
        legacy_repeated, optimized_repeated = compare_repeated_scrapes()
        
        # Concurrent scrapes comparison
        legacy_concurrent, optimized_concurrent = compare_concurrent_scrapes()
        
        print("\n" + "="*50)
        print("📊 PERFORMANCE SUMMARY")
        print("="*50)
        
        if legacy_single != float('inf') and optimized_single != float('inf'):
            single_improvement = ((legacy_single - optimized_single) / legacy_single) * 100
            print(f"Single scrape improvement: {single_improvement:.1f}%")
        
        if legacy_repeated != float('inf') and optimized_repeated != float('inf'):
            repeated_improvement = ((legacy_repeated - optimized_repeated) / legacy_repeated) * 100
            print(f"Repeated scrapes improvement: {repeated_improvement:.1f}%")
        
        if legacy_concurrent != float('inf') and optimized_concurrent != float('inf'):
            concurrent_improvement = ((legacy_concurrent - optimized_concurrent) / legacy_concurrent) * 100
            print(f"Concurrent scrapes improvement: {concurrent_improvement:.1f}%")
        
        print("\n🎯 KEY OPTIMIZATIONS:")
        print("• Driver pooling eliminates repeated initialization overhead")
        print("• Smart waiting replaces fixed 2-second delays")
        print("• Bulk JavaScript operations for faster element extraction")
        print("• Performance-optimized Chrome flags")
        print("• Proper timeout handling prevents hanging")
        print("• Thread-safe concurrent processing")
        
    except Exception as e:
        print(f"Comparison failed: {e}")
        print("Make sure you have internet connection and all dependencies installed.")