File size: 5,435 Bytes
675dcd6
8e956a0
675dcd6
7e7b0a9
c297293
7e7b0a9
c297293
7e7b0a9
 
 
 
 
 
 
 
 
 
 
 
675dcd6
c297293
 
 
7e7b0a9
c297293
7e7b0a9
c297293
 
 
675dcd6
c297293
7e7b0a9
c297293
 
7e7b0a9
c297293
 
7e7b0a9
 
c297293
 
 
 
 
675dcd6
c297293
 
 
7e7b0a9
c297293
 
7e7b0a9
c297293
 
 
 
7e7b0a9
 
c297293
 
7e7b0a9
c297293
 
675dcd6
f13aa68
c297293
675dcd6
 
c297293
 
 
 
 
 
 
 
 
 
 
 
84e3e1d
c297293
 
 
7e7b0a9
 
 
c297293
84e3e1d
 
c297293
84e3e1d
c297293
 
 
84e3e1d
 
 
 
c297293
 
ebb0ab7
c297293
ebb0ab7
c297293
 
 
 
ebb0ab7
c297293
 
 
ebb0ab7
 
 
7e7b0a9
c297293
7e7b0a9
84e3e1d
c297293
84e3e1d
7e7b0a9
c297293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e7b0a9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import aiohttp
import logging
import random
import asyncio
from typing import Optional

# WebShare proxies
# Each entry is "ip:port:username:password"; get_proxy_url() reformats an
# entry into an aiohttp-style "http://user:pass@ip:port" URL.
# SECURITY NOTE(review): credentials are hard-coded in source — consider
# loading them from environment variables or a secrets store instead.
WEBSHARE_PROXIES = [
    "198.23.239.134:6540:zvubytfw:ak6yit5k2tvj",
    "207.244.217.165:6712:zvubytfw:ak6yit5k2tvj",
    "107.172.163.27:6543:zvubytfw:ak6yit5k2tvj",
    "161.123.152.115:6360:zvubytfw:ak6yit5k2tvj",
    "23.94.138.75:6349:zvubytfw:ak6yit5k2tvj",
    "216.10.27.159:6837:zvubytfw:ak6yit5k2tvj",
    "136.0.207.84:6661:zvubytfw:ak6yit5k2tvj",
    "64.64.118.149:6732:zvubytfw:ak6yit5k2tvj",
    "142.147.128.93:6593:zvubytfw:ak6yit5k2tvj",
    "154.36.110.199:6853:zvubytfw:ak6yit5k2tvj"
]

# Track proxy performance.
# Maps raw proxy string -> failure count; entries with a count >= 3 are
# skipped by get_proxy_url() until reset.
proxy_failures = {}
# Round-robin cursor into WEBSHARE_PROXIES; starts at -1 so the first
# call to get_proxy_url() advances to index 0.
current_proxy_idx = -1

logger = logging.getLogger(__name__)

def get_proxy_url() -> str:
    """Return the next usable proxy as an aiohttp-style proxy URL.

    Proxies are chosen round-robin from WEBSHARE_PROXIES; entries with 3 or
    more recorded failures are skipped. If every proxy is currently marked
    as failing, the failure counters are cleared and selection is retried
    once. (The original implementation recursed for that retry, which could
    recurse without bound if every entry were malformed.)

    Returns:
        A proxy URL of the form "http://username:password@ip:port".

    Raises:
        RuntimeError: if WEBSHARE_PROXIES contains no well-formed entry.
    """
    global current_proxy_idx

    # Two passes at most: the second pass only runs after the failure
    # counters have been cleared, so it is guaranteed to consider every
    # well-formed proxy.
    for _pass in range(2):
        for _ in range(len(WEBSHARE_PROXIES)):
            current_proxy_idx = (current_proxy_idx + 1) % len(WEBSHARE_PROXIES)
            proxy_str = WEBSHARE_PROXIES[current_proxy_idx]

            # Skip frequently failing proxies
            if proxy_failures.get(proxy_str, 0) >= 3:
                continue

            # Reformat "ip:port:user:pass" for aiohttp
            parts = proxy_str.split(':')
            if len(parts) == 4:
                ip, port, username, password = parts
                return f"http://{username}:{password}@{ip}:{port}"

        # All proxies marked failing: reset the counters and try once more.
        proxy_failures.clear()

    # Only reachable if WEBSHARE_PROXIES is empty or every entry is malformed.
    raise RuntimeError("No well-formed proxy entries in WEBSHARE_PROXIES")

# Strong references to pending reset tasks: asyncio holds only weak refs to
# tasks, so a fire-and-forget task can be garbage-collected before it runs.
_reset_tasks: set = set()

def mark_proxy_failure(proxy_url: str) -> None:
    """Record one failure for the proxy behind *proxy_url*.

    *proxy_url* is expected to be a URL produced by get_proxy_url()
    ("http://user:pass@ip:port"). When a proxy reaches 3 failures, a
    background task is scheduled to clear its counter after 5 minutes.

    Args:
        proxy_url: The proxy URL whose underlying proxy failed.
    """
    for proxy_str in WEBSHARE_PROXIES:
        ip = proxy_str.split(':')[0]
        # Match on "@ip:" rather than a bare substring so one proxy's IP
        # cannot accidentally match inside a longer IP in another URL.
        if f"@{ip}:" in proxy_url:
            proxy_failures[proxy_str] = proxy_failures.get(proxy_str, 0) + 1
            if proxy_failures[proxy_str] >= 3:
                try:
                    task = asyncio.create_task(reset_proxy_after_delay(proxy_str, 300))
                except RuntimeError:
                    # No running event loop (called from synchronous code).
                    # The counter will still be cleared by get_proxy_url()
                    # once every proxy is marked as failing.
                    pass
                else:
                    _reset_tasks.add(task)
                    task.add_done_callback(_reset_tasks.discard)
            break

async def reset_proxy_after_delay(proxy_str: str, delay: int) -> None:
    """Sleep for *delay* seconds, then zero the failure count of *proxy_str*.

    A proxy with no recorded failures is left untouched — this never
    creates a new entry in proxy_failures.
    """
    await asyncio.sleep(delay)
    is_tracked = proxy_str in proxy_failures
    if is_tracked:
        proxy_failures[proxy_str] = 0

async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Fetch *url* through rotating proxies, falling back to a direct request.

    Up to 3 proxies are tried with a 2-second pause between attempts. A
    proxy is marked as failing when the response has an error status, is
    too short to be a real page, or — for search URLs — contains no
    property-card markup. If every proxy attempt fails, one direct
    (proxy-less) request is made as a last resort.

    Args:
        session: An open aiohttp client session.
        url: The page to fetch.
        headers: Base request headers; browser-like headers are merged in.

    Returns:
        The page body as text, or None if every attempt failed.
    """
    logger.info(f"Requesting URL: {url}")

    # Browser-like headers to reduce the chance of bot detection.
    enhanced_headers = headers.copy()
    enhanced_headers.update({
        "Accept-Encoding": "gzip, deflate, br",  # Explicitly specify brotli support
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Referer": "https://www.google.com/",
    })

    # Try up to 3 proxies
    max_attempts = 3
    for attempt in range(max_attempts):
        proxy_url = get_proxy_url()
        logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")

        try:
            # Longer total timeout for proxies. ClientTimeout is aiohttp's
            # non-deprecated way to pass a timeout (a bare int is legacy).
            # NOTE(review): ssl=False disables TLS certificate verification
            # for proxied requests — confirm this is intentional.
            async with session.get(
                url,
                headers=enhanced_headers,
                proxy=proxy_url,
                timeout=aiohttp.ClientTimeout(total=25),
                ssl=False,
                allow_redirects=True
            ) as response:
                if response.status in (200, 202):
                    content = await response.text()

                    # A real page is reasonably large and looks like HTML;
                    # anything else is treated as a bot-detection response.
                    if len(content) > 5000 and ("<html" in content or "<!DOCTYPE" in content):
                        # Search pages must actually contain result cards.
                        if "searchresults" in url or "search" in url:
                            if ("property-card" in content
                                    or "sr_property_block" in content
                                    or "sr_item" in content):
                                logger.info(f"Successfully retrieved search results ({len(content)} bytes)")
                                return content
                            logger.warning("No property cards found in search results")
                            mark_proxy_failure(proxy_url)
                        else:
                            logger.info(f"Successfully retrieved content ({len(content)} bytes)")
                            return content
                    else:
                        logger.warning(f"Response too short or not HTML: {len(content)} bytes")
                        mark_proxy_failure(proxy_url)
                else:
                    logger.warning(f"Response status {response.status} from proxy {proxy_url}")
                    mark_proxy_failure(proxy_url)
        except Exception as e:
            logger.error(f"Proxy request failed: {str(e)}")
            mark_proxy_failure(proxy_url)

        # Pause between attempts only. The original also slept after the
        # final attempt, delaying the direct fallback for no benefit.
        if attempt < max_attempts - 1:
            await asyncio.sleep(2)

    # If proxies failed, try a direct connection as a last resort.
    logger.warning("All proxies failed, trying direct connection")
    try:
        async with session.get(
            url,
            headers=enhanced_headers,
            timeout=aiohttp.ClientTimeout(total=15)
        ) as response:
            if response.status == 200:
                content = await response.text()
                if len(content) > 5000:
                    return content
    except Exception as e:
        logger.error(f"Direct request also failed: {str(e)}")

    return None