garvitcpp committed on
Commit
f349f41
·
verified ·
1 Parent(s): 612285a

Update services/utils/http_utils.py

Browse files
Files changed (1) hide show
  1. services/utils/http_utils.py +6 -131
services/utils/http_utils.py CHANGED
@@ -1,141 +1,16 @@
1
  import aiohttp
2
  import logging
3
  import random
4
- import asyncio
5
  from typing import Optional
6
 
7
- # WebShare proxies
8
- WEBSHARE_PROXIES = [
9
- "198.23.239.134:6540:zvubytfw:ak6yit5k2tvj",
10
- "207.244.217.165:6712:zvubytfw:ak6yit5k2tvj",
11
- "107.172.163.27:6543:zvubytfw:ak6yit5k2tvj",
12
- "161.123.152.115:6360:zvubytfw:ak6yit5k2tvj",
13
- "23.94.138.75:6349:zvubytfw:ak6yit5k2tvj",
14
- "216.10.27.159:6837:zvubytfw:ak6yit5k2tvj",
15
- "136.0.207.84:6661:zvubytfw:ak6yit5k2tvj",
16
- "64.64.118.149:6732:zvubytfw:ak6yit5k2tvj",
17
- "142.147.128.93:6593:zvubytfw:ak6yit5k2tvj",
18
- "154.36.110.199:6853:zvubytfw:ak6yit5k2tvj"
19
- ]
20
-
21
- # Track proxy performance
22
- proxy_failures = {}
23
- current_proxy_idx = -1
24
 
25
  logger = logging.getLogger(__name__)
26
 
27
- def get_proxy_url() -> str:
28
- """Get a proxy URL using round-robin with failure detection"""
29
- global current_proxy_idx
30
-
31
- # Try all proxies if necessary
32
- for _ in range(len(WEBSHARE_PROXIES)):
33
- current_proxy_idx = (current_proxy_idx + 1) % len(WEBSHARE_PROXIES)
34
- proxy_str = WEBSHARE_PROXIES[current_proxy_idx]
35
-
36
- # Skip frequently failing proxies
37
- if proxy_failures.get(proxy_str, 0) >= 3:
38
- continue
39
-
40
- # Format proxy for aiohttp
41
- parts = proxy_str.split(':')
42
- if len(parts) == 4:
43
- ip, port, username, password = parts
44
- return f"http://{username}:{password}@{ip}:{port}"
45
-
46
- # If all proxies have failures, reset and try again
47
- proxy_failures.clear()
48
- return get_proxy_url()
49
-
50
- def mark_proxy_failure(proxy_url: str) -> None:
51
- """Mark a proxy as failing"""
52
- for proxy_str in WEBSHARE_PROXIES:
53
- if proxy_str.split(':')[0] in proxy_url:
54
- proxy_failures[proxy_str] = proxy_failures.get(proxy_str, 0) + 1
55
- if proxy_failures[proxy_str] >= 3:
56
- asyncio.create_task(reset_proxy_after_delay(proxy_str, 300))
57
- break
58
-
59
- async def reset_proxy_after_delay(proxy_str: str, delay: int) -> None:
60
- """Reset a proxy's failure count after a delay"""
61
- await asyncio.sleep(delay)
62
- if proxy_str in proxy_failures:
63
- proxy_failures[proxy_str] = 0
64
-
65
  async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
66
- """Fetch a page using proxies with retry logic"""
67
- logger.info(f"Requesting URL: {url}")
68
-
69
- # Enhanced headers
70
- enhanced_headers = headers.copy()
71
- enhanced_headers.update({
72
- "Accept-Encoding": "gzip, deflate, br", # Explicitly specify brotli support
73
- "Cache-Control": "no-cache",
74
- "Pragma": "no-cache",
75
- "Sec-Fetch-Dest": "document",
76
- "Sec-Fetch-Mode": "navigate",
77
- "Sec-Fetch-Site": "none",
78
- "Sec-Fetch-User": "?1",
79
- "Referer": "https://www.google.com/",
80
- })
81
-
82
- # Try up to 3 proxies
83
- for attempt in range(3):
84
- proxy_url = get_proxy_url()
85
- logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
86
-
87
- try:
88
- # Use a longer timeout for proxies
89
- async with session.get(
90
- url,
91
- headers=enhanced_headers,
92
- proxy=proxy_url,
93
- timeout=25,
94
- ssl=False,
95
- allow_redirects=True
96
- ) as response:
97
- if response.status in [200, 202]:
98
- content = await response.text()
99
-
100
- # Check if we got actual content, not a bot detection page
101
- if len(content) > 5000 and ("<html" in content or "<!DOCTYPE" in content):
102
- # Check for property content on search pages
103
- if "searchresults" in url or "search" in url:
104
- if "property-card" in content or "sr_property_block" in content or "sr_item" in content:
105
- logger.info(f"Successfully retrieved search results ({len(content)} bytes)")
106
- return content
107
- else:
108
- logger.warning("No property cards found in search results")
109
- mark_proxy_failure(proxy_url)
110
- else:
111
- logger.info(f"Successfully retrieved content ({len(content)} bytes)")
112
- return content
113
- else:
114
- logger.warning(f"Response too short or not HTML: {len(content)} bytes")
115
- mark_proxy_failure(proxy_url)
116
- else:
117
- logger.warning(f"Response status {response.status} from proxy {proxy_url}")
118
- mark_proxy_failure(proxy_url)
119
- except Exception as e:
120
- logger.error(f"Proxy request failed: {str(e)}")
121
- mark_proxy_failure(proxy_url)
122
-
123
- # Wait before next attempt
124
- await asyncio.sleep(2)
125
-
126
- # If proxies failed, try direct connection as last resort
127
- logger.warning("All proxies failed, trying direct connection")
128
- try:
129
- async with session.get(
130
- url,
131
- headers=enhanced_headers,
132
- timeout=15
133
- ) as response:
134
- if response.status == 200:
135
- content = await response.text()
136
- if len(content) > 5000:
137
- return content
138
- except Exception as e:
139
- logger.error(f"Direct request also failed: {str(e)}")
140
 
141
- return None
 
 
1
  import aiohttp
2
  import logging
3
  import random
 
4
  from typing import Optional
5
 
6
+ from .browser_utils import fetch_page_with_browser
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  logger = logging.getLogger(__name__)
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
11
+ """Fetch a page using browser automation with proxies"""
12
+ # Get user agent from headers
13
+ user_agent = headers.get('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
+ # Use the browser-based approach
16
+ return await fetch_page_with_browser(url, user_agent)