import aiohttp
import logging
import random
import asyncio
from typing import Optional, List
import time
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
logger = logging.getLogger(__name__)
# WebShare proxies list (format: IP:PORT:USERNAME:PASSWORD)
# NOTE(review): credentials are hardcoded in source — they should be moved to
# environment variables (load_dotenv is already called above) and these keys rotated.
WEBSHARE_PROXIES = [
"198.23.239.134:6540:zvubytfw:ak6yit5k2tvj",
"207.244.217.165:6712:zvubytfw:ak6yit5k2tvj",
"107.172.163.27:6543:zvubytfw:ak6yit5k2tvj",
"161.123.152.115:6360:zvubytfw:ak6yit5k2tvj",
"23.94.138.75:6349:zvubytfw:ak6yit5k2tvj",
"216.10.27.159:6837:zvubytfw:ak6yit5k2tvj",
"136.0.207.84:6661:zvubytfw:ak6yit5k2tvj",
"64.64.118.149:6732:zvubytfw:ak6yit5k2tvj",
"142.147.128.93:6593:zvubytfw:ak6yit5k2tvj",
"154.36.110.199:6853:zvubytfw:ak6yit5k2tvj"
]
# Track proxy usage and failures
# proxy string -> number of times selected by get_next_proxy()
proxy_usage_count = {}
# proxy string -> consecutive failure count; proxies at >= 3 are skipped
proxy_failure_count = {}
# index of the proxy handed out last (round-robin cursor; -1 = none yet)
last_proxy_index = -1
def format_proxy_url(proxy_str: str) -> Optional[str]:
    """Convert a ``IP:PORT:USERNAME:PASSWORD`` proxy string to an HTTP proxy URL.

    Args:
        proxy_str: Proxy definition in ``IP:PORT:USERNAME:PASSWORD`` form.

    Returns:
        The proxy URL ``http://USERNAME:PASSWORD@IP:PORT``, or ``None`` if the
        input does not have exactly four colon-separated fields.
        (Return annotation fixed: the original declared ``str`` but returned
        ``None`` on malformed input.)
    """
    parts = proxy_str.split(':')
    if len(parts) != 4:
        logger.error(f"Invalid proxy format: {proxy_str}")
        return None
    ip, port, username, password = parts
    return f"http://{username}:{password}@{ip}:{port}"
def get_next_proxy() -> Optional[str]:
    """Select the next proxy round-robin, skipping proxies with many failures.

    Proxies whose failure count has reached 3 are skipped. If every proxy is
    in that state, all failure counts are cleared and selection runs once more.

    Returns:
        A proxy URL from :func:`format_proxy_url`, or ``None`` if no proxies
        are configured or the chosen entry is malformed.
        (Annotation fixed: ``format_proxy_url`` may return ``None``; the
        original also raised ``ZeroDivisionError`` on an empty proxy list and
        used recursion where a second pass suffices.)
    """
    global last_proxy_index
    # Guard: modulo below would raise ZeroDivisionError on an empty pool.
    if not WEBSHARE_PROXIES:
        logger.error("No proxies configured")
        return None
    # Two passes at most: the second runs only after failure counts are cleared,
    # so it is guaranteed to yield a proxy (replaces the original recursion).
    for _pass in range(2):
        for _ in range(len(WEBSHARE_PROXIES)):
            last_proxy_index = (last_proxy_index + 1) % len(WEBSHARE_PROXIES)
            proxy_str = WEBSHARE_PROXIES[last_proxy_index]
            # Skip proxies with too many recent failures
            if proxy_failure_count.get(proxy_str, 0) >= 3:
                continue
            # Track usage
            proxy_usage_count[proxy_str] = proxy_usage_count.get(proxy_str, 0) + 1
            return format_proxy_url(proxy_str)
        # Every proxy is marked failed: give them all a second chance.
        proxy_failure_count.clear()
    return None
def mark_proxy_failure(proxy_url: str):
    """Mark the proxy behind *proxy_url* as having failed once.

    The proxy is matched back to its ``WEBSHARE_PROXIES`` entry by checking
    that its IP and username both appear in the URL. After 3 failures the
    proxy is cooled down for 10 minutes via :func:`reset_proxy_failure`.

    Args:
        proxy_url: The proxy URL previously returned by :func:`get_next_proxy`.
    """
    for proxy_str in WEBSHARE_PROXIES:
        # Split once instead of twice per iteration (original called split twice).
        ip, _port, username, _password = proxy_str.split(':')
        if ip in proxy_url and username in proxy_url:
            proxy_failure_count[proxy_str] = proxy_failure_count.get(proxy_str, 0) + 1
            logger.warning(f"Marked proxy as failed: {proxy_url} (failure count: {proxy_failure_count[proxy_str]})")
            # Reset failure count after 10 minutes to give proxy a second chance
            if proxy_failure_count[proxy_str] >= 3:
                logger.warning(f"Proxy {proxy_url} has failed multiple times, cooling down")
                try:
                    asyncio.create_task(reset_proxy_failure(proxy_str, 600))  # 10 minutes cooldown
                except RuntimeError:
                    # No running event loop (called from sync context); skip the
                    # timed cooldown — get_next_proxy() clears counts as fallback.
                    logger.warning("No running event loop; proxy cooldown reset skipped")
            break
async def reset_proxy_failure(proxy_str: str, delay: int):
    """Clear a proxy's failure count once *delay* seconds have elapsed.

    Args:
        proxy_str: The raw ``IP:PORT:USERNAME:PASSWORD`` entry to forgive.
        delay: Seconds to wait before clearing the count.
    """
    await asyncio.sleep(delay)
    # Guard clause: nothing to reset if the proxy never recorded a failure.
    if proxy_str not in proxy_failure_count:
        return
    proxy_failure_count[proxy_str] = 0
    logger.info(f"Reset failure count for proxy: {proxy_str}")
async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Fetch a page through the WebShare proxy pool, retrying across proxies.

    Up to 3 different proxies are tried. A response only counts as success if
    it is HTTP 200, longer than 1000 bytes, and looks like HTML — small or
    non-HTML bodies are treated as anti-bot responses and the proxy is marked
    failed.

    Args:
        session: An open aiohttp client session.
        url: The URL to fetch.
        headers: Extra headers that override the built-in browser-like set.

    Returns:
        The page body as text, or ``None`` if every proxy attempt failed.
    """
    logger.info(f"Requesting URL: {url}")
    # Enhanced headers that look more like a browser
    enhanced_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "max-age=0",
        "TE": "Trailers",
        "Referer": "https://www.google.com/"
    }
    # Update with any custom headers provided
    enhanced_headers.update(headers)
    # Try up to 3 different proxies
    max_proxy_attempts = 3
    for attempt in range(max_proxy_attempts):
        proxy_url = get_next_proxy()
        if not proxy_url:
            logger.error("Failed to get a valid proxy")
            return None
        logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
        try:
            # Try with this proxy
            async with session.get(
                url,
                headers=enhanced_headers,
                proxy=proxy_url,
                # Bare-int timeouts are deprecated in aiohttp 3.x; use ClientTimeout.
                timeout=aiohttp.ClientTimeout(total=30),
                allow_redirects=True
            ) as response:
                if response.status == 200:
                    content = await response.text()
                    # Verify we got actual content (common anti-bot techniques return empty pages)
                    # Case-insensitive check: pages using lowercase "<!doctype html>"
                    # were rejected by the original case-sensitive comparison.
                    lowered = content.lower()
                    if len(content) > 1000 and ("<html" in lowered or "<!doctype" in lowered):
                        logger.info(f"Successfully retrieved content ({len(content)} bytes)")
                        return content
                    else:
                        logger.warning(f"Response too small or not HTML: {len(content)} bytes")
                        mark_proxy_failure(proxy_url)
                else:
                    logger.warning(f"Response status code: {response.status}")
                    mark_proxy_failure(proxy_url)
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            logger.error(f"Request failed with proxy {proxy_url}: {str(e)}")
            mark_proxy_failure(proxy_url)
        except Exception as e:
            logger.error(f"Unexpected error: {str(e)}")
            mark_proxy_failure(proxy_url)
        # Wait before trying the next proxy — but not after the final attempt.
        if attempt < max_proxy_attempts - 1:
            await asyncio.sleep(1)
    logger.error("All proxy attempts failed")
    return None