garvitcpp commited on
Commit
84e3e1d
·
verified ·
1 Parent(s): 6a75974

Update services/utils/http_utils.py

Browse files
Files changed (1) hide show
  1. services/utils/http_utils.py +28 -56
services/utils/http_utils.py CHANGED
@@ -5,7 +5,6 @@ import asyncio
5
  from typing import Optional, List
6
  import time
7
  from dotenv import load_dotenv
8
- from aiohttp_socks import ProxyConnector # Better proxy connector
9
 
10
  # Load environment variables
11
  load_dotenv()
@@ -81,58 +80,14 @@ async def reset_proxy_failure(proxy_str: str, delay: int):
81
  proxy_failure_count[proxy_str] = 0
82
  logger.info(f"Reset failure count for proxy: {proxy_str}")
83
 
84
async def fetch_with_session(session: aiohttp.ClientSession, url: str, proxy_url: str, headers: dict) -> Optional[str]:
    """Make a single GET request through the given session and proxy.

    Args:
        session: An open aiohttp client session to issue the request on.
        url: Target URL to fetch.
        proxy_url: Proxy URL forwarded to aiohttp's ``proxy=`` argument
            (the visible caller passes ``None``, meaning a direct request).
        headers: Request headers to send.

    Returns:
        The response body as text when the request succeeds with a 200/202
        status and the payload looks like a real HTML document; ``None`` on
        any failure (bad status, tiny/non-HTML body, or a raised exception).
    """
    try:
        async with session.get(
            url,
            headers=headers,
            proxy=proxy_url,
            # aiohttp 3.x deprecates passing a bare int here; ClientTimeout
            # is the supported way to express a total request timeout.
            timeout=aiohttp.ClientTimeout(total=30),
            allow_redirects=True,
        ) as response:
            # Accept both 200 and 202 status codes — some targets reply 202
            # while still serving usable content.
            if response.status not in (200, 202):
                logger.warning(f"Response status code: {response.status}")
                return None

            content = await response.text()

            # Verify we got actual content: anti-bot systems frequently
            # return tiny or non-HTML bodies, which we treat as failures so
            # the caller can rotate to another proxy.
            if len(content) > 1000 and ("<html" in content or "<!DOCTYPE" in content):
                logger.info(f"Successfully retrieved content ({len(content)} bytes)")
                return content

            logger.warning(f"Response too small or not HTML: {len(content)} bytes")
            return None
    except Exception as e:
        # Deliberately broad: this helper is best-effort and the caller
        # interprets None as "try the next proxy".
        logger.error(f"Request failed: {str(e)}")
        return None
111
-
112
  async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
113
  """Fetch a page using WebShare proxies with retry logic"""
114
  logger.info(f"Requesting URL: {url}")
115
 
116
- # Enhanced headers that look more like a browser
117
- enhanced_headers = {
118
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
119
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
120
- "Accept-Language": "en-US,en;q=0.9",
121
- # Don't specify Accept-Encoding to let aiohttp handle it with brotli installed
122
- "Connection": "keep-alive",
123
- "Upgrade-Insecure-Requests": "1",
124
- "Sec-Fetch-Dest": "document",
125
- "Sec-Fetch-Mode": "navigate",
126
- "Sec-Fetch-Site": "none",
127
- "Sec-Fetch-User": "?1",
128
- "Cache-Control": "max-age=0",
129
- "Referer": "https://www.google.com/"
130
- }
131
- # Update with any custom headers provided
132
- enhanced_headers.update(headers)
133
-
134
  # Try up to 3 different proxies
135
- for attempt in range(3):
 
 
136
  proxy_url = get_next_proxy()
137
  if not proxy_url:
138
  logger.error("Failed to get a valid proxy")
@@ -141,20 +96,37 @@ async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) ->
141
  logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
142
 
143
  try:
144
- # Create a new session for each proxy to avoid connection issues
145
- connector = ProxyConnector.from_url(proxy_url)
146
- async with aiohttp.ClientSession(connector=connector) as proxy_session:
147
- content = await fetch_with_session(proxy_session, url, None, enhanced_headers)
148
- if content:
149
- return content
 
 
 
 
 
 
 
 
 
 
 
 
150
  else:
 
151
  mark_proxy_failure(proxy_url)
152
- except Exception as e:
 
153
  logger.error(f"Request failed with proxy {proxy_url}: {str(e)}")
154
  mark_proxy_failure(proxy_url)
 
 
 
155
 
156
  # Wait before trying next proxy
157
- await asyncio.sleep(2)
158
 
159
  logger.error("All proxy attempts failed")
160
  return None
 
5
  from typing import Optional, List
6
  import time
7
  from dotenv import load_dotenv
 
8
 
9
  # Load environment variables
10
  load_dotenv()
 
80
  proxy_failure_count[proxy_str] = 0
81
  logger.info(f"Reset failure count for proxy: {proxy_str}")
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
84
  """Fetch a page using WebShare proxies with retry logic"""
85
  logger.info(f"Requesting URL: {url}")
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  # Try up to 3 different proxies
88
+ max_proxy_attempts = 3
89
+
90
+ for attempt in range(max_proxy_attempts):
91
  proxy_url = get_next_proxy()
92
  if not proxy_url:
93
  logger.error("Failed to get a valid proxy")
 
96
  logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
97
 
98
  try:
99
+ # Try with this proxy
100
+ async with session.get(
101
+ url,
102
+ headers=headers,
103
+ proxy=proxy_url,
104
+ timeout=30,
105
+ ssl=False
106
+ ) as response:
107
+ if response.status in [200, 202]:
108
+ content = await response.text()
109
+
110
+ # Verify we got actual content (common anti-bot techniques return empty pages)
111
+ if len(content) > 1000 and ("<html" in content or "<!DOCTYPE" in content):
112
+ logger.info(f"Successfully retrieved content ({len(content)} bytes)")
113
+ return content
114
+ else:
115
+ logger.warning(f"Response too small or not HTML: {len(content)} bytes")
116
+ mark_proxy_failure(proxy_url)
117
  else:
118
+ logger.warning(f"Response status code: {response.status}")
119
  mark_proxy_failure(proxy_url)
120
+
121
+ except (aiohttp.ClientError, asyncio.TimeoutError) as e:
122
  logger.error(f"Request failed with proxy {proxy_url}: {str(e)}")
123
  mark_proxy_failure(proxy_url)
124
+ except Exception as e:
125
+ logger.error(f"Unexpected error: {str(e)}")
126
+ mark_proxy_failure(proxy_url)
127
 
128
  # Wait before trying next proxy
129
+ await asyncio.sleep(1)
130
 
131
  logger.error("All proxy attempts failed")
132
  return None