garvitcpp commited on
Commit
c297293
·
verified ·
1 Parent(s): 84e3e1d

Update services/utils/http_utils.py

Browse files
Files changed (1) hide show
  1. services/utils/http_utils.py +94 -79
services/utils/http_utils.py CHANGED
@@ -2,16 +2,9 @@ import aiohttp
2
  import logging
3
  import random
4
  import asyncio
5
- from typing import Optional, List
6
- import time
7
- from dotenv import load_dotenv
8
 
9
- # Load environment variables
10
- load_dotenv()
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
- # WebShare proxies list (format: IP:PORT:USERNAME:PASSWORD)
15
  WEBSHARE_PROXIES = [
16
  "198.23.239.134:6540:zvubytfw:ak6yit5k2tvj",
17
  "207.244.217.165:6712:zvubytfw:ak6yit5k2tvj",
@@ -25,108 +18,130 @@ WEBSHARE_PROXIES = [
25
  "154.36.110.199:6853:zvubytfw:ak6yit5k2tvj"
26
  ]
27
 
28
- # Track proxy usage and failures
29
- proxy_failure_count = {}
30
- last_proxy_index = -1
31
 
32
- def format_proxy_url(proxy_str: str) -> str:
33
- """Convert proxy string to proxy URL format"""
34
- parts = proxy_str.split(':')
35
- if len(parts) != 4:
36
- logger.error(f"Invalid proxy format: {proxy_str}")
37
- return None
38
-
39
- ip, port, username, password = parts
40
- return f"http://{username}:{password}@{ip}:{port}"
41
 
42
- def get_next_proxy() -> str:
43
- """Get the next proxy using a round-robin approach with failure consideration"""
44
- global last_proxy_index
45
 
46
- # Simple round-robin selection with failure skipping
47
  for _ in range(len(WEBSHARE_PROXIES)):
48
- last_proxy_index = (last_proxy_index + 1) % len(WEBSHARE_PROXIES)
49
- proxy_str = WEBSHARE_PROXIES[last_proxy_index]
50
 
51
- # Skip proxies with too many recent failures
52
- if proxy_failure_count.get(proxy_str, 0) >= 3:
53
  continue
54
 
55
- return format_proxy_url(proxy_str)
 
 
 
 
56
 
57
- # If all proxies have failures, reset failure counts and try again
58
- proxy_failure_count.clear()
59
- logger.warning("All proxies have failure records, resetting counts")
60
- return get_next_proxy()
61
 
62
- def mark_proxy_failure(proxy_url: str):
63
- """Mark a proxy as having a failure"""
64
- # Extract the original proxy string from the URL
65
  for proxy_str in WEBSHARE_PROXIES:
66
- if proxy_str.split(':')[0] in proxy_url and proxy_str.split(':')[2] in proxy_url:
67
- proxy_failure_count[proxy_str] = proxy_failure_count.get(proxy_str, 0) + 1
68
- logger.warning(f"Marked proxy as failed: {proxy_url} (failure count: {proxy_failure_count[proxy_str]})")
69
-
70
- # Reset failure count after 5 minutes
71
- if proxy_failure_count[proxy_str] >= 3:
72
- logger.warning(f"Proxy {proxy_url} has failed multiple times, cooling down")
73
- asyncio.create_task(reset_proxy_failure(proxy_str, 300)) # 5 minutes cooldown
74
  break
75
 
76
- async def reset_proxy_failure(proxy_str: str, delay: int):
77
- """Reset the failure count for a proxy after a delay"""
78
  await asyncio.sleep(delay)
79
- if proxy_str in proxy_failure_count:
80
- proxy_failure_count[proxy_str] = 0
81
- logger.info(f"Reset failure count for proxy: {proxy_str}")
82
 
83
  async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
84
- """Fetch a page using WebShare proxies with retry logic"""
85
  logger.info(f"Requesting URL: {url}")
86
 
87
- # Try up to 3 different proxies
88
- max_proxy_attempts = 3
 
 
 
 
 
 
 
 
 
 
89
 
90
- for attempt in range(max_proxy_attempts):
91
- proxy_url = get_next_proxy()
92
- if not proxy_url:
93
- logger.error("Failed to get a valid proxy")
94
- return None
95
-
96
  logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
97
 
 
 
 
 
 
 
 
 
 
 
98
  try:
99
- # Try with this proxy
100
  async with session.get(
101
  url,
102
- headers=headers,
103
  proxy=proxy_url,
104
- timeout=30,
105
- ssl=False
 
106
  ) as response:
107
  if response.status in [200, 202]:
108
  content = await response.text()
109
 
110
- # Verify we got actual content (common anti-bot techniques return empty pages)
111
- if len(content) > 1000 and ("<html" in content or "<!DOCTYPE" in content):
112
- logger.info(f"Successfully retrieved content ({len(content)} bytes)")
113
- return content
114
- else:
115
- logger.warning(f"Response too small or not HTML: {len(content)} bytes")
116
- mark_proxy_failure(proxy_url)
 
 
 
 
 
117
  else:
118
- logger.warning(f"Response status code: {response.status}")
119
  mark_proxy_failure(proxy_url)
120
-
121
- except (aiohttp.ClientError, asyncio.TimeoutError) as e:
122
- logger.error(f"Request failed with proxy {proxy_url}: {str(e)}")
123
- mark_proxy_failure(proxy_url)
124
  except Exception as e:
125
- logger.error(f"Unexpected error: {str(e)}")
126
  mark_proxy_failure(proxy_url)
127
 
128
- # Wait before trying next proxy
129
- await asyncio.sleep(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
- logger.error("All proxy attempts failed")
132
  return None
 
2
  import logging
3
  import random
4
  import asyncio
5
+ from typing import Optional
 
 
6
 
7
+ # WebShare proxies (format IP:PORT:USERNAME:PASSWORD) — NOTE(review): credentials are hardcoded in source; move them to environment/config and rotate the exposed password
 
 
 
 
 
8
  WEBSHARE_PROXIES = [
9
  "198.23.239.134:6540:zvubytfw:ak6yit5k2tvj",
10
  "207.244.217.165:6712:zvubytfw:ak6yit5k2tvj",
 
18
  "154.36.110.199:6853:zvubytfw:ak6yit5k2tvj"
19
  ]
20
 
21
+ # Track proxy performance
22
+ proxy_failures = {}
23
+ current_proxy_idx = -1
24
 
25
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
26
 
27
def get_proxy_url() -> str:
    """Return the next usable proxy as an ``http://user:pass@ip:port`` URL.

    Proxies are handed out round-robin; any proxy with 3 or more recorded
    failures is skipped.  If every proxy is currently marked as failing,
    the failure counts are reset once and the rotation is retried.

    Returns:
        A formatted proxy URL string.

    Raises:
        RuntimeError: if no well-formed proxy entry exists at all
            (WEBSHARE_PROXIES is empty or every entry is malformed).
    """
    global current_proxy_idx

    # Two passes: the second one runs after clearing the failure counts.
    # This replaces the previous unbounded recursion, which looped forever
    # when no entry could ever be formatted (clearing failure counts does
    # not fix a malformed proxy string).
    for _pass in range(2):
        for _ in range(len(WEBSHARE_PROXIES)):
            current_proxy_idx = (current_proxy_idx + 1) % len(WEBSHARE_PROXIES)
            proxy_str = WEBSHARE_PROXIES[current_proxy_idx]

            # Skip frequently failing proxies.
            if proxy_failures.get(proxy_str, 0) >= 3:
                continue

            # Entries are "IP:PORT:USERNAME:PASSWORD"; ignore malformed ones.
            parts = proxy_str.split(':')
            if len(parts) == 4:
                ip, port, username, password = parts
                return f"http://{username}:{password}@{ip}:{port}"

        # Every proxy was skipped -- reset the counters and try once more.
        proxy_failures.clear()

    raise RuntimeError("No usable proxy could be built from WEBSHARE_PROXIES")
 
49
 
50
# Strong references to in-flight cooldown tasks: asyncio keeps only weak
# references to tasks, so without this set a scheduled reset task could be
# garbage-collected before it ever runs.
_reset_tasks: set = set()


def mark_proxy_failure(proxy_url: str) -> None:
    """Record one failure for the proxy behind *proxy_url*.

    The proxy is matched back to its WEBSHARE_PROXIES entry by IP address
    (the URL embeds credentials, so an exact string compare is not used).
    After 3 failures a background task is scheduled that clears the count
    again after a 5-minute cooldown.

    Args:
        proxy_url: the ``http://user:pass@ip:port`` URL that failed.
    """
    for proxy_str in WEBSHARE_PROXIES:
        # proxy_str is "IP:PORT:USER:PASS"; match on the IP portion.
        if proxy_str.split(':')[0] in proxy_url:
            proxy_failures[proxy_str] = proxy_failures.get(proxy_str, 0) + 1
            if proxy_failures[proxy_str] >= 3:
                try:
                    task = asyncio.create_task(
                        reset_proxy_after_delay(proxy_str, 300))
                except RuntimeError:
                    # No running event loop (called from sync context).
                    # get_proxy_url's reset pass will clear the count later.
                    pass
                else:
                    # Keep the task alive until done, then drop the reference.
                    _reset_tasks.add(task)
                    task.add_done_callback(_reset_tasks.discard)
            break
58
 
59
async def reset_proxy_after_delay(proxy_str: str, delay: int) -> None:
    """Sleep for *delay* seconds, then zero the failure count of *proxy_str*.

    Only entries already present in ``proxy_failures`` are touched; this
    never adds a new key to the dictionary.

    Args:
        proxy_str: the raw "IP:PORT:USER:PASS" entry to reset.
        delay: cooldown duration in seconds.
    """
    await asyncio.sleep(delay)
    # Zero the counter rather than deleting the key, matching the reads
    # done elsewhere via proxy_failures.get(..., 0).
    if proxy_failures.get(proxy_str) is not None:
        proxy_failures[proxy_str] = 0
 
64
 
65
async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Fetch *url* through rotating WebShare proxies, with a direct-connection
    fallback when every proxy attempt fails.

    Args:
        session: open aiohttp session used for all requests.
        url: page to retrieve (may be rewritten for known-problematic
            "hosteller" searches).
        headers: base request headers; browser-like headers are merged on top.

    Returns:
        The HTML body on success, or None when no attempt produced
        plausible content.
    """
    logger.info(f"Requesting URL: {url}")

    # Merge browser-like headers on top of the caller's to look less like a bot.
    enhanced_headers = headers.copy()
    enhanced_headers.update({
        "Accept-Encoding": "gzip, deflate, br",  # Explicitly specify brotli support
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Referer": "https://www.google.com/",
    })

    # Special adjustments for "hosteller" searches.  Hoisted out of the retry
    # loop: the rewrites are idempotent, so doing them once is enough.
    if "hosteller" in url.lower():
        enhanced_headers["Referer"] = "https://www.google.com/search?q=hosteller+hotels+india"
        if "old+manali" in url.lower():
            # Remove the location qualifier for better results.
            url = url.replace("old+manali", "manali")
        elif "narkanda" in url.lower():
            # Try a more general search for Narkanda hostels.
            url = url.replace("the+hosteller+narkanda", "hostels+in+narkanda")

    # Try up to 3 proxies.
    max_attempts = 3
    for attempt in range(max_attempts):
        proxy_url = get_proxy_url()
        logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")

        try:
            async with session.get(
                url,
                headers=enhanced_headers,
                proxy=proxy_url,
                # ClientTimeout object instead of a bare int: passing a
                # plain number for `timeout` is deprecated in aiohttp.
                timeout=aiohttp.ClientTimeout(total=25),
                # NOTE(review): ssl=False disables certificate verification;
                # confirm this is an accepted trade-off for these proxies.
                ssl=False,
                allow_redirects=True
            ) as response:
                if response.status in [200, 202]:
                    content = await response.text()

                    # Heuristic bot-detection check: real pages are large HTML.
                    if len(content) > 5000 and ("<html" in content or "<!DOCTYPE" in content):
                        if "searchresults" in url or "search" in url:
                            # Search pages must actually contain result cards.
                            if "property-card" in content or "sr_property_block" in content:
                                logger.info(f"Successfully retrieved search results ({len(content)} bytes)")
                                return content
                            logger.warning("No property cards found in search results")
                            # Likely a bot-detection page served through this
                            # proxy -- rotate it out (previously unmarked).
                            mark_proxy_failure(proxy_url)
                        else:
                            logger.info(f"Successfully retrieved content ({len(content)} bytes)")
                            return content
                    else:
                        # Tiny or non-HTML body: treat as a proxy failure so
                        # the bad exit node stops being reused (previously
                        # these responses were silently ignored).
                        logger.warning(f"Response too small or not HTML: {len(content)} bytes")
                        mark_proxy_failure(proxy_url)
                else:
                    logger.warning(f"Response status {response.status} from proxy {proxy_url}")
                    mark_proxy_failure(proxy_url)
        except Exception as e:
            logger.error(f"Proxy request failed: {str(e)}")
            mark_proxy_failure(proxy_url)

        # Wait before the next attempt -- but not after the last one, so the
        # direct fallback is not needlessly delayed.
        if attempt < max_attempts - 1:
            await asyncio.sleep(2)

    # If proxies failed, try a direct connection as a last resort.
    logger.warning("All proxies failed, trying direct connection")
    try:
        async with session.get(
            url,
            headers=enhanced_headers,
            timeout=aiohttp.ClientTimeout(total=15)
        ) as response:
            if response.status == 200:
                content = await response.text()
                if len(content) > 5000:
                    return content
    except Exception as e:
        logger.error(f"Direct request also failed: {str(e)}")

    return None