File size: 5,932 Bytes
675dcd6
8e956a0
675dcd6
7e7b0a9
 
675dcd6
7e7b0a9
 
 
 
8e956a0
 
 
7e7b0a9
 
 
 
 
 
 
 
 
 
 
 
 
675dcd6
7e7b0a9
 
 
 
 
 
 
 
 
 
 
675dcd6
7e7b0a9
 
 
 
 
 
675dcd6
7e7b0a9
 
 
 
 
 
 
 
 
 
 
 
 
675dcd6
7e7b0a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675dcd6
f13aa68
7e7b0a9
675dcd6
 
7e7b0a9
 
 
 
675dcd6
 
 
 
7e7b0a9
 
675dcd6
7e7b0a9
 
 
675dcd6
7e7b0a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
999991a
7e7b0a9
 
 
 
 
 
 
 
 
 
999991a
7e7b0a9
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import aiohttp
import logging
import random
import asyncio
from typing import Optional, List
import time
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Module-level logger for this proxy helper module.
logger = logging.getLogger(__name__)

# WebShare proxies list (format: IP:PORT:USERNAME:PASSWORD)
# NOTE(review): proxy credentials are hardcoded in source even though dotenv
# is initialized above — consider loading these from environment variables;
# confirm with the team before changing.
WEBSHARE_PROXIES = [
    "198.23.239.134:6540:zvubytfw:ak6yit5k2tvj",
    "207.244.217.165:6712:zvubytfw:ak6yit5k2tvj",
    "107.172.163.27:6543:zvubytfw:ak6yit5k2tvj",
    "161.123.152.115:6360:zvubytfw:ak6yit5k2tvj",
    "23.94.138.75:6349:zvubytfw:ak6yit5k2tvj",
    "216.10.27.159:6837:zvubytfw:ak6yit5k2tvj",
    "136.0.207.84:6661:zvubytfw:ak6yit5k2tvj",
    "64.64.118.149:6732:zvubytfw:ak6yit5k2tvj",
    "142.147.128.93:6593:zvubytfw:ak6yit5k2tvj",
    "154.36.110.199:6853:zvubytfw:ak6yit5k2tvj"
]

# Track proxy usage and failures
proxy_usage_count = {}    # proxy string -> times handed out by get_next_proxy()
proxy_failure_count = {}  # proxy string -> recorded failures (cleared on reset/cooldown)
last_proxy_index = -1     # index of the most recently returned proxy; -1 = none yet

def format_proxy_url(proxy_str: str) -> Optional[str]:
    """Convert an "IP:PORT:USERNAME:PASSWORD" proxy string to a proxy URL.

    Args:
        proxy_str: Proxy description in "IP:PORT:USERNAME:PASSWORD" form.

    Returns:
        An "http://username:password@ip:port" URL, or None when the input
        does not split into exactly four colon-separated fields.
        (The original annotation claimed ``str`` but the error path
        returns None, so callers must handle that case.)
    """
    parts = proxy_str.split(':')
    if len(parts) != 4:
        logger.error(f"Invalid proxy format: {proxy_str}")
        return None

    ip, port, username, password = parts
    return f"http://{username}:{password}@{ip}:{port}"

def get_next_proxy() -> Optional[str]:
    """Pick the next proxy in round-robin order, skipping failing ones.

    Proxies with 3 or more recorded failures are skipped. If every proxy
    is currently marked as failing, all failure counts are reset and the
    scan runs once more, so a proxy is always returned when the pool is
    non-empty.

    Returns:
        A proxy URL produced by format_proxy_url(), or None when the
        proxy pool is empty (the original tail-recursive fallback would
        recurse forever in that case).
    """
    global last_proxy_index

    # Guard: with no proxies configured the reset-and-retry fallback
    # would loop forever; bail out instead.
    if not WEBSHARE_PROXIES:
        return None

    # At most two passes: the second runs after failure counts are reset.
    for _ in range(2):
        # Simple round-robin selection with failure skipping
        for _ in range(len(WEBSHARE_PROXIES)):
            last_proxy_index = (last_proxy_index + 1) % len(WEBSHARE_PROXIES)
            proxy_str = WEBSHARE_PROXIES[last_proxy_index]

            # Skip proxies with too many recent failures
            if proxy_failure_count.get(proxy_str, 0) >= 3:
                continue

            # Track usage
            proxy_usage_count[proxy_str] = proxy_usage_count.get(proxy_str, 0) + 1

            return format_proxy_url(proxy_str)

        # All proxies have failures: reset counts and try one more pass.
        proxy_failure_count.clear()

    return None  # unreachable with a non-empty pool; keeps the return type honest

def mark_proxy_failure(proxy_url: str):
    """Record a failure for the proxy behind *proxy_url*.

    The proxy URL is matched back to its "IP:PORT:USERNAME:PASSWORD"
    pool entry by checking that both the IP and the username occur in
    the URL. After 3 failures a 10-minute asynchronous cooldown is
    scheduled that resets the proxy's failure count.
    """
    # Extract the original proxy string from the URL
    for proxy_str in WEBSHARE_PROXIES:
        ip, _, username, _ = proxy_str.split(':')  # split once, not twice
        if ip in proxy_url and username in proxy_url:
            proxy_failure_count[proxy_str] = proxy_failure_count.get(proxy_str, 0) + 1
            logger.warning(f"Marked proxy as failed: {proxy_url} (failure count: {proxy_failure_count[proxy_str]})")

            # Reset failure count after 10 minutes to give proxy a second chance
            if proxy_failure_count[proxy_str] >= 3:
                logger.warning(f"Proxy {proxy_url} has failed multiple times, cooling down")
                try:
                    # create_task() requires a running event loop; this is a
                    # synchronous function, so guard against being called
                    # outside one (previously this raised RuntimeError).
                    asyncio.create_task(reset_proxy_failure(proxy_str, 600))  # 10 minutes cooldown
                except RuntimeError:
                    logger.warning("No running event loop; proxy cooldown reset not scheduled")
            break

async def reset_proxy_failure(proxy_str: str, delay: int):
    """Zero out the failure count for *proxy_str* once *delay* seconds pass."""
    await asyncio.sleep(delay)
    # Guard clause: nothing to do if the proxy has no recorded failures.
    if proxy_str not in proxy_failure_count:
        return
    proxy_failure_count[proxy_str] = 0
    logger.info(f"Reset failure count for proxy: {proxy_str}")

async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Fetch a page through WebShare proxies, rotating on failure.

    Up to 3 proxies are tried. A response counts as a success only when
    it is HTTP 200 and looks like real HTML (over 1000 bytes containing
    an "<html" or "<!DOCTYPE" marker); anything else marks the proxy as
    failed and the next proxy is tried.

    Args:
        session: An open aiohttp client session.
        url: The URL to fetch.
        headers: Extra headers; these override the browser-like defaults.

    Returns:
        The page body as text on success, or None when no proxy could be
        obtained or every attempt failed.
    """
    logger.info(f"Requesting URL: {url}")

    # Enhanced headers that look more like a browser
    enhanced_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "max-age=0",
        "TE": "Trailers",
        "Referer": "https://www.google.com/"
    }
    # Update with any custom headers provided
    enhanced_headers.update(headers)

    # Try up to 3 different proxies
    max_proxy_attempts = 3

    for attempt in range(max_proxy_attempts):
        proxy_url = get_next_proxy()
        if not proxy_url:
            logger.error("Failed to get a valid proxy")
            return None

        logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")

        try:
            # Try with this proxy. A ClientTimeout object is the supported
            # form; passing a bare int total-timeout is deprecated in aiohttp.
            async with session.get(
                url,
                headers=enhanced_headers,
                proxy=proxy_url,
                timeout=aiohttp.ClientTimeout(total=30),
                allow_redirects=True
            ) as response:
                if response.status == 200:
                    content = await response.text()

                    # Verify we got actual content (common anti-bot techniques return empty pages)
                    if len(content) > 1000 and ("<html" in content or "<!DOCTYPE" in content):
                        logger.info(f"Successfully retrieved content ({len(content)} bytes)")
                        return content
                    else:
                        logger.warning(f"Response too small or not HTML: {len(content)} bytes")
                        mark_proxy_failure(proxy_url)
                else:
                    logger.warning(f"Response status code: {response.status}")
                    mark_proxy_failure(proxy_url)

        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            logger.error(f"Request failed with proxy {proxy_url}: {str(e)}")
            mark_proxy_failure(proxy_url)
        except Exception as e:
            logger.error(f"Unexpected error: {str(e)}")
            mark_proxy_failure(proxy_url)

        # Wait before trying the next proxy — but not after the final attempt.
        if attempt < max_proxy_attempts - 1:
            await asyncio.sleep(1)

    logger.error("All proxy attempts failed")
    return None