garvitcpp committed on
Commit
675dcd6
·
verified ·
1 Parent(s): 5739cff

Update services/utils/http_utils.py

Browse files
Files changed (1) hide show
  1. services/utils/http_utils.py +112 -8
services/utils/http_utils.py CHANGED
@@ -1,20 +1,124 @@
1
- import aiohttp # type: ignore
2
  import logging
3
- from typing import Optional
 
 
 
 
 
4
 
5
  logger = logging.getLogger(__name__)
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Fetch a page using aiohttp.

    Returns the response body as text on HTTP 200; on any other status
    code or on a request error, logs the problem and returns None.
    """
    try:
        logger.info(f"Requesting URL: {url}")
        async with session.get(url, headers=headers, timeout=15) as response:
            # Guard clause: anything other than 200 is treated as a failure.
            if response.status != 200:
                logger.error(f"Error retrieving URL {url}: Status code {response.status}")
                return None
            logger.debug(f"Successfully retrieved content from {url}")
            return await response.text()
    except Exception as e:
        # Best-effort fetch: swallow the error, report it, signal failure via None.
        logger.error(f"Request failed for {url}: {e}")
        return None
 
1
+ import aiohttp
2
  import logging
3
+ from typing import Optional, List
4
+ import asyncio
5
+ from fp.fp import FreeProxy
6
+ import random
7
+ from aiohttp_retry import RetryClient, ExponentialRetry
8
+ import time
9
 
10
  logger = logging.getLogger(__name__)
11
 
12
# Cache of working proxies, refreshed at most once per PROXY_REFRESH_INTERVAL.
WORKING_PROXIES = []
PROXY_REFRESH_TIME = 0
PROXY_REFRESH_INTERVAL = 60 * 10  # 10 minutes

def get_working_proxies() -> List[str]:
    """Return a list of working proxy URLs, refreshing the cache when stale.

    Returns:
        A *copy* of the cached proxy list, so callers may reorder or mutate
        the result (e.g. random.shuffle) without corrupting the shared
        module-level cache. May be empty if no proxy could be obtained.
    """
    global WORKING_PROXIES, PROXY_REFRESH_TIME

    current_time = time.time()

    # Serve from the cache while it is still fresh.
    if WORKING_PROXIES and (current_time - PROXY_REFRESH_TIME) < PROXY_REFRESH_INTERVAL:
        return list(WORKING_PROXIES)

    # Cache is empty or stale: try to collect up to 5 distinct proxies.
    try:
        proxies = []
        for _ in range(5):
            try:
                proxy = FreeProxy(https=True, rand=True, timeout=1).get()
                if proxy and proxy not in proxies:
                    proxies.append(proxy)
            except Exception:
                # Best-effort: one failed lookup must not abort the refresh.
                pass

        if proxies:
            WORKING_PROXIES = proxies
            PROXY_REFRESH_TIME = current_time
            logger.info(f"Refreshed proxy list, found {len(proxies)} working proxies")
    except Exception as e:
        logger.error(f"Error refreshing proxy list: {e}")

    # Fall back to whatever we have (possibly stale or empty).
    # Return a copy: handing out the cache object itself let callers
    # mutate it in place (fetch_page shuffles the returned list).
    return list(WORKING_PROXIES)
48
+
49
async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Fetch a page using aiohttp with free proxies and retry logic.

    Tries each cached free proxy in random order; if all proxies fail (or
    none are available), falls back to a direct request with disguised
    browser-like headers.

    Args:
        session: shared aiohttp session, used by the direct-request fallback.
        url: the URL to fetch.
        headers: base request headers.

    Returns:
        The response body as text on HTTP 200, otherwise None.
    """
    logger.info(f"Requesting URL: {url}")

    # Copy the proxy list so shuffling below never mutates the shared cache.
    proxies = list(get_working_proxies())

    # Try with proxies if available
    if proxies:
        # Shuffle proxies for better distribution
        random.shuffle(proxies)

        # Try each proxy until one works
        for proxy in proxies:
            try:
                logger.info(f"Trying with proxy: {proxy}")
                retry_options = ExponentialRetry(attempts=2)
                # Use the retry client as a context manager so its internal
                # session/connector is always closed (previously it leaked).
                async with RetryClient(raise_for_status=False, retry_options=retry_options) as retry_client:
                    async with retry_client.get(
                        url,
                        headers=headers,
                        proxy=proxy,
                        timeout=20,
                        ssl=False  # Some free proxies don't support SSL verification
                    ) as response:
                        if response.status == 200:
                            logger.info(f"Successfully retrieved content via proxy")
                            return await response.text()
                        else:
                            logger.warning(f"Proxy {proxy} failed with status {response.status}")
            except Exception as e:
                logger.warning(f"Error using proxy {proxy}: {str(e)}")
                continue

    # If all proxies failed or no proxies available, try direct request with extensive disguise
    logger.info("All proxies failed or no proxies available, trying direct request with disguised headers")
    return await direct_request(session, url, headers)
88
+
89
async def direct_request(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Attempt a direct request with enhanced browser-like headers.

    Args:
        session: shared aiohttp session (currently unused; kept for
            interface compatibility with fetch_page's call site).
        url: the URL to fetch.
        headers: base request headers, augmented below with browser-like fields.

    Returns:
        The response body as text on HTTP 200, otherwise None.
    """
    # Enhance headers to look more like a real browser
    enhanced_headers = headers.copy()
    enhanced_headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Sec-Ch-Ua": '"Google Chrome";v="123", "Not:A-Brand";v="8"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "Referer": "https://www.google.com/"
    })

    try:
        # Configure retry client for direct requests too
        retry_options = ExponentialRetry(attempts=3)
        # Context-manage the retry client so its internal session/connector
        # is always closed (previously it was created and never closed).
        async with RetryClient(raise_for_status=False, retry_options=retry_options) as retry_client:
            async with retry_client.get(url, headers=enhanced_headers, timeout=20) as response:
                if response.status == 200:
                    logger.info(f"Successfully retrieved content directly")
                    return await response.text()
                else:
                    logger.error(f"Direct request failed with status code {response.status}")
                    return None
    except Exception as e:
        logger.error(f"Direct request failed: {e}")
        return None