File size: 5,435 Bytes
675dcd6 8e956a0 675dcd6 7e7b0a9 c297293 7e7b0a9 c297293 7e7b0a9 675dcd6 c297293 7e7b0a9 c297293 7e7b0a9 c297293 675dcd6 c297293 7e7b0a9 c297293 7e7b0a9 c297293 7e7b0a9 c297293 675dcd6 c297293 7e7b0a9 c297293 7e7b0a9 c297293 7e7b0a9 c297293 7e7b0a9 c297293 675dcd6 f13aa68 c297293 675dcd6 c297293 84e3e1d c297293 7e7b0a9 c297293 84e3e1d c297293 84e3e1d c297293 84e3e1d c297293 ebb0ab7 c297293 ebb0ab7 c297293 ebb0ab7 c297293 ebb0ab7 7e7b0a9 c297293 7e7b0a9 84e3e1d c297293 84e3e1d 7e7b0a9 c297293 7e7b0a9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | import aiohttp
import logging
import random
import asyncio
from typing import Optional
# WebShare proxies, formatted as "ip:port:username:password".
# NOTE(security): credentials are hard-coded; move them to an environment
# variable or secrets store before committing/sharing this file.
WEBSHARE_PROXIES = [
    "198.23.239.134:6540:zvubytfw:ak6yit5k2tvj",
    "207.244.217.165:6712:zvubytfw:ak6yit5k2tvj",
    "107.172.163.27:6543:zvubytfw:ak6yit5k2tvj",
    "161.123.152.115:6360:zvubytfw:ak6yit5k2tvj",
    "23.94.138.75:6349:zvubytfw:ak6yit5k2tvj",
    "216.10.27.159:6837:zvubytfw:ak6yit5k2tvj",
    "136.0.207.84:6661:zvubytfw:ak6yit5k2tvj",
    "64.64.118.149:6732:zvubytfw:ak6yit5k2tvj",
    "142.147.128.93:6593:zvubytfw:ak6yit5k2tvj",
    "154.36.110.199:6853:zvubytfw:ak6yit5k2tvj"
]
# Failure count per proxy string; entries at/above 3 are skipped in rotation.
proxy_failures = {}
# Round-robin cursor: index of the most recently issued proxy (-1 = none yet).
current_proxy_idx = -1
logger = logging.getLogger(__name__)
def get_proxy_url() -> str:
    """Return the next healthy proxy as an ``http://user:pass@ip:port`` URL.

    Rotates round-robin over WEBSHARE_PROXIES, skipping proxies with 3 or
    more recorded failures. If every proxy is currently marked failing,
    the failure table is cleared once and the scan is retried.

    Raises:
        RuntimeError: if the proxy list is empty, or no entry matches the
            expected ``ip:port:user:pass`` format. (The previous version
            recursed forever in these cases.)
    """
    global current_proxy_idx
    if not WEBSHARE_PROXIES:
        raise RuntimeError("No WebShare proxies configured")
    # Two passes at most: a normal scan, then one retry after clearing
    # the failure table. This bounds what used to be unbounded recursion.
    for _pass in range(2):
        for _ in range(len(WEBSHARE_PROXIES)):
            current_proxy_idx = (current_proxy_idx + 1) % len(WEBSHARE_PROXIES)
            proxy_str = WEBSHARE_PROXIES[current_proxy_idx]
            # Skip frequently failing proxies
            if proxy_failures.get(proxy_str, 0) >= 3:
                continue
            # Format proxy for aiohttp
            parts = proxy_str.split(':')
            if len(parts) == 4:
                ip, port, username, password = parts
                return f"http://{username}:{password}@{ip}:{port}"
        # Every proxy was skipped: reset failure counts and rescan once.
        proxy_failures.clear()
    raise RuntimeError("No proxy entry matches the ip:port:user:pass format")
# Strong references to pending reset tasks: asyncio keeps only weak
# references to tasks, so an unreferenced task can be garbage-collected
# before it runs.
_pending_resets = set()
def mark_proxy_failure(proxy_url: str) -> None:
    """Record one failure for the proxy embedded in *proxy_url*.

    Matches the proxy by its IP substring against WEBSHARE_PROXIES. Once a
    proxy accumulates 3 failures, a background task is scheduled to clear
    its counter after 5 minutes — but only when an event loop is running;
    this is a sync function and may be called without one.
    """
    for proxy_str in WEBSHARE_PROXIES:
        # Substring match on the IP portion ("ip:port:user:pass" -> ip).
        if proxy_str.split(':')[0] in proxy_url:
            proxy_failures[proxy_str] = proxy_failures.get(proxy_str, 0) + 1
            if proxy_failures[proxy_str] >= 3:
                try:
                    task = asyncio.create_task(reset_proxy_after_delay(proxy_str, 300))
                except RuntimeError:
                    # No running event loop (sync caller). The counter will
                    # still be cleared by get_proxy_url()'s full-table reset
                    # once all proxies are marked failing.
                    pass
                else:
                    # Hold a reference until the task completes.
                    _pending_resets.add(task)
                    task.add_done_callback(_pending_resets.discard)
            break
async def reset_proxy_after_delay(proxy_str: str, delay: int) -> None:
    """Sleep for *delay* seconds, then zero the failure count of *proxy_str*.

    A proxy that was never recorded in the failure table is left untouched.
    """
    await asyncio.sleep(delay)
    if proxy_str not in proxy_failures:
        return
    proxy_failures[proxy_str] = 0
def _is_search_url(url: str) -> bool:
    """True when *url* is a search-results page that must contain listings."""
    return "searchresults" in url or "search" in url
def _has_property_cards(content: str) -> bool:
    """True when the HTML contains any known property-card marker."""
    return (
        "property-card" in content
        or "sr_property_block" in content
        or "sr_item" in content
    )
async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Fetch a page using proxies with retry logic.

    Args:
        session: Open aiohttp client session used for every request.
        url: Absolute URL to fetch.
        headers: Base request headers; browser-like anti-bot headers are
            layered on top (the caller's dict is not mutated).

    Returns:
        The page HTML on success, or None once 3 proxy attempts and one
        direct-connection fallback have all failed (or returned only
        short/bot-detection content).
    """
    logger.info(f"Requesting URL: {url}")
    # Enhanced headers to resemble a real browser navigation.
    enhanced_headers = headers.copy()
    enhanced_headers.update({
        "Accept-Encoding": "gzip, deflate, br",  # Explicitly specify brotli support
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Referer": "https://www.google.com/",
    })
    # Try up to 3 proxies
    for attempt in range(3):
        proxy_url = get_proxy_url()
        logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
        try:
            async with session.get(
                url,
                headers=enhanced_headers,
                proxy=proxy_url,
                # Longer timeout for proxies. ClientTimeout is aiohttp's
                # documented timeout type; passing a bare number is deprecated.
                timeout=aiohttp.ClientTimeout(total=25),
                # NOTE(security): ssl=False disables certificate verification —
                # tolerated here for proxy compatibility, but it weakens
                # transport security; confirm this is intentional.
                ssl=False,
                allow_redirects=True
            ) as response:
                if response.status in [200, 202]:
                    content = await response.text()
                    # Heuristic: real pages are >5KB of HTML; anything smaller
                    # is treated as a bot-detection/interstitial stub.
                    if len(content) > 5000 and ("<html" in content or "<!DOCTYPE" in content):
                        if _is_search_url(url):
                            if _has_property_cards(content):
                                logger.info(f"Successfully retrieved search results ({len(content)} bytes)")
                                return content
                            logger.warning("No property cards found in search results")
                            mark_proxy_failure(proxy_url)
                        else:
                            logger.info(f"Successfully retrieved content ({len(content)} bytes)")
                            return content
                    else:
                        logger.warning(f"Response too short or not HTML: {len(content)} bytes")
                        mark_proxy_failure(proxy_url)
                else:
                    logger.warning(f"Response status {response.status} from proxy {proxy_url}")
                    mark_proxy_failure(proxy_url)
        except Exception as e:
            # Broad catch is deliberate: any proxy/network error should
            # rotate to the next proxy rather than abort the whole fetch.
            logger.error(f"Proxy request failed: {str(e)}")
            mark_proxy_failure(proxy_url)
        # Brief back-off before the next proxy. Skipped after the final
        # attempt so the direct fallback is not needlessly delayed.
        if attempt < 2:
            await asyncio.sleep(2)
    # If proxies failed, try direct connection as last resort
    logger.warning("All proxies failed, trying direct connection")
    try:
        async with session.get(
            url,
            headers=enhanced_headers,
            timeout=aiohttp.ClientTimeout(total=15)
        ) as response:
            if response.status == 200:
                content = await response.text()
                # Same minimum-size heuristic as the proxy path.
                if len(content) > 5000:
                    return content
    except Exception as e:
        logger.error(f"Direct request also failed: {str(e)}")
    return None