garvitcpp commited on
Commit
6a75974
·
verified ·
1 Parent(s): 1bca05a

Update services/utils/http_utils.py

Browse files
Files changed (1) hide show
  1. services/utils/http_utils.py +46 -36
services/utils/http_utils.py CHANGED
@@ -5,6 +5,7 @@ import asyncio
5
  from typing import Optional, List
6
  import time
7
  from dotenv import load_dotenv
 
8
 
9
  # Load environment variables
10
  load_dotenv()
@@ -26,7 +27,6 @@ WEBSHARE_PROXIES = [
26
  ]
27
 
28
  # Track proxy usage and failures
29
- proxy_usage_count = {}
30
  proxy_failure_count = {}
31
  last_proxy_index = -1
32
 
@@ -53,13 +53,11 @@ def get_next_proxy() -> str:
53
  if proxy_failure_count.get(proxy_str, 0) >= 3:
54
  continue
55
 
56
- # Track usage
57
- proxy_usage_count[proxy_str] = proxy_usage_count.get(proxy_str, 0) + 1
58
-
59
  return format_proxy_url(proxy_str)
60
 
61
  # If all proxies have failures, reset failure counts and try again
62
  proxy_failure_count.clear()
 
63
  return get_next_proxy()
64
 
65
  def mark_proxy_failure(proxy_url: str):
@@ -70,10 +68,10 @@ def mark_proxy_failure(proxy_url: str):
70
  proxy_failure_count[proxy_str] = proxy_failure_count.get(proxy_str, 0) + 1
71
  logger.warning(f"Marked proxy as failed: {proxy_url} (failure count: {proxy_failure_count[proxy_str]})")
72
 
73
- # Reset failure count after 10 minutes to give proxy a second chance
74
  if proxy_failure_count[proxy_str] >= 3:
75
  logger.warning(f"Proxy {proxy_url} has failed multiple times, cooling down")
76
- asyncio.create_task(reset_proxy_failure(proxy_str, 600)) # 10 minutes cooldown
77
  break
78
 
79
  async def reset_proxy_failure(proxy_str: str, delay: int):
@@ -83,6 +81,34 @@ async def reset_proxy_failure(proxy_str: str, delay: int):
83
  proxy_failure_count[proxy_str] = 0
84
  logger.info(f"Reset failure count for proxy: {proxy_str}")
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
87
  """Fetch a page using WebShare proxies with retry logic"""
88
  logger.info(f"Requesting URL: {url}")
@@ -92,20 +118,21 @@ async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) ->
92
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
93
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
94
  "Accept-Language": "en-US,en;q=0.9",
95
- "Accept-Encoding": "gzip, deflate, br",
96
  "Connection": "keep-alive",
97
  "Upgrade-Insecure-Requests": "1",
 
 
 
 
98
  "Cache-Control": "max-age=0",
99
- "TE": "Trailers",
100
  "Referer": "https://www.google.com/"
101
  }
102
  # Update with any custom headers provided
103
  enhanced_headers.update(headers)
104
 
105
  # Try up to 3 different proxies
106
- max_proxy_attempts = 3
107
-
108
- for attempt in range(max_proxy_attempts):
109
  proxy_url = get_next_proxy()
110
  if not proxy_url:
111
  logger.error("Failed to get a valid proxy")
@@ -114,37 +141,20 @@ async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) ->
114
  logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
115
 
116
  try:
117
- # Try with this proxy
118
- async with session.get(
119
- url,
120
- headers=enhanced_headers,
121
- proxy=proxy_url,
122
- timeout=30,
123
- allow_redirects=True
124
- ) as response:
125
- if response.status == 200:
126
- content = await response.text()
127
-
128
- # Verify we got actual content (common anti-bot techniques return empty pages)
129
- if len(content) > 1000 and ("<html" in content or "<!DOCTYPE" in content):
130
- logger.info(f"Successfully retrieved content ({len(content)} bytes)")
131
- return content
132
- else:
133
- logger.warning(f"Response too small or not HTML: {len(content)} bytes")
134
- mark_proxy_failure(proxy_url)
135
  else:
136
- logger.warning(f"Response status code: {response.status}")
137
  mark_proxy_failure(proxy_url)
138
-
139
- except (aiohttp.ClientError, asyncio.TimeoutError) as e:
140
- logger.error(f"Request failed with proxy {proxy_url}: {str(e)}")
141
- mark_proxy_failure(proxy_url)
142
  except Exception as e:
143
- logger.error(f"Unexpected error: {str(e)}")
144
  mark_proxy_failure(proxy_url)
145
 
146
  # Wait before trying next proxy
147
- await asyncio.sleep(1)
148
 
149
  logger.error("All proxy attempts failed")
150
  return None
 
5
  from typing import Optional, List
6
  import time
7
  from dotenv import load_dotenv
8
+ from aiohttp_socks import ProxyConnector # Better proxy connector
9
 
10
  # Load environment variables
11
  load_dotenv()
 
27
  ]
28
 
29
  # Track proxy usage and failures
 
30
  proxy_failure_count = {}
31
  last_proxy_index = -1
32
 
 
53
  if proxy_failure_count.get(proxy_str, 0) >= 3:
54
  continue
55
 
 
 
 
56
  return format_proxy_url(proxy_str)
57
 
58
  # If all proxies have failures, reset failure counts and try again
59
  proxy_failure_count.clear()
60
+ logger.warning("All proxies have failure records, resetting counts")
61
  return get_next_proxy()
62
 
63
  def mark_proxy_failure(proxy_url: str):
 
68
  proxy_failure_count[proxy_str] = proxy_failure_count.get(proxy_str, 0) + 1
69
  logger.warning(f"Marked proxy as failed: {proxy_url} (failure count: {proxy_failure_count[proxy_str]})")
70
 
71
+ # Reset failure count after 5 minutes
72
  if proxy_failure_count[proxy_str] >= 3:
73
  logger.warning(f"Proxy {proxy_url} has failed multiple times, cooling down")
74
+ asyncio.create_task(reset_proxy_failure(proxy_str, 300)) # 5 minutes cooldown
75
  break
76
 
77
  async def reset_proxy_failure(proxy_str: str, delay: int):
 
81
  proxy_failure_count[proxy_str] = 0
82
  logger.info(f"Reset failure count for proxy: {proxy_str}")
83
 
84
+ async def fetch_with_session(session: aiohttp.ClientSession, url: str, proxy_url: str, headers: dict) -> Optional[str]:
85
+ """Make a request with the given session and proxy"""
86
+ try:
87
+ async with session.get(
88
+ url,
89
+ headers=headers,
90
+ proxy=proxy_url,
91
+ timeout=30,
92
+ allow_redirects=True
93
+ ) as response:
94
+ # Accept both 200 and 202 status codes
95
+ if response.status in (200, 202):
96
+ content = await response.text()
97
+
98
+ # Verify we got actual content
99
+ if len(content) > 1000 and ("<html" in content or "<!DOCTYPE" in content):
100
+ logger.info(f"Successfully retrieved content ({len(content)} bytes)")
101
+ return content
102
+ else:
103
+ logger.warning(f"Response too small or not HTML: {len(content)} bytes")
104
+ return None
105
+ else:
106
+ logger.warning(f"Response status code: {response.status}")
107
+ return None
108
+ except Exception as e:
109
+ logger.error(f"Request failed: {str(e)}")
110
+ return None
111
+
112
  async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
113
  """Fetch a page using WebShare proxies with retry logic"""
114
  logger.info(f"Requesting URL: {url}")
 
118
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
119
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
120
  "Accept-Language": "en-US,en;q=0.9",
121
+ # Don't specify Accept-Encoding to let aiohttp handle it with brotli installed
122
  "Connection": "keep-alive",
123
  "Upgrade-Insecure-Requests": "1",
124
+ "Sec-Fetch-Dest": "document",
125
+ "Sec-Fetch-Mode": "navigate",
126
+ "Sec-Fetch-Site": "none",
127
+ "Sec-Fetch-User": "?1",
128
  "Cache-Control": "max-age=0",
 
129
  "Referer": "https://www.google.com/"
130
  }
131
  # Update with any custom headers provided
132
  enhanced_headers.update(headers)
133
 
134
  # Try up to 3 different proxies
135
+ for attempt in range(3):
 
 
136
  proxy_url = get_next_proxy()
137
  if not proxy_url:
138
  logger.error("Failed to get a valid proxy")
 
141
  logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
142
 
143
  try:
144
+ # Create a new session for each proxy to avoid connection issues
145
+ connector = ProxyConnector.from_url(proxy_url)
146
+ async with aiohttp.ClientSession(connector=connector) as proxy_session:
147
+ content = await fetch_with_session(proxy_session, url, None, enhanced_headers)
148
+ if content:
149
+ return content
 
 
 
 
 
 
 
 
 
 
 
 
150
  else:
 
151
  mark_proxy_failure(proxy_url)
 
 
 
 
152
  except Exception as e:
153
+ logger.error(f"Request failed with proxy {proxy_url}: {str(e)}")
154
  mark_proxy_failure(proxy_url)
155
 
156
  # Wait before trying next proxy
157
+ await asyncio.sleep(2)
158
 
159
  logger.error("All proxy attempts failed")
160
  return None