garvitcpp commited on
Commit
c297293
·
verified ·
1 Parent(s): 84e3e1d

Update services/utils/http_utils.py

Browse files
Files changed (1) hide show
  1. services/utils/http_utils.py +94 -79
services/utils/http_utils.py CHANGED
@@ -2,16 +2,9 @@ import aiohttp
2
  import logging
3
  import random
4
  import asyncio
5
- from typing import Optional, List
6
- import time
7
- from dotenv import load_dotenv
8
 
9
- # Load environment variables
10
- load_dotenv()
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
- # WebShare proxies list (format: IP:PORT:USERNAME:PASSWORD)
15
  WEBSHARE_PROXIES = [
16
  "198.23.239.134:6540:zvubytfw:ak6yit5k2tvj",
17
  "207.244.217.165:6712:zvubytfw:ak6yit5k2tvj",
@@ -25,108 +18,130 @@ WEBSHARE_PROXIES = [
25
  "154.36.110.199:6853:zvubytfw:ak6yit5k2tvj"
26
  ]
27
 
28
- # Track proxy usage and failures
29
- proxy_failure_count = {}
30
- last_proxy_index = -1
31
 
32
- def format_proxy_url(proxy_str: str) -> str:
33
- """Convert proxy string to proxy URL format"""
34
- parts = proxy_str.split(':')
35
- if len(parts) != 4:
36
- logger.error(f"Invalid proxy format: {proxy_str}")
37
- return None
38
-
39
- ip, port, username, password = parts
40
- return f"http://{username}:{password}@{ip}:{port}"
41
 
42
- def get_next_proxy() -> str:
43
- """Get the next proxy using a round-robin approach with failure consideration"""
44
- global last_proxy_index
45
 
46
- # Simple round-robin selection with failure skipping
47
  for _ in range(len(WEBSHARE_PROXIES)):
48
- last_proxy_index = (last_proxy_index + 1) % len(WEBSHARE_PROXIES)
49
- proxy_str = WEBSHARE_PROXIES[last_proxy_index]
50
 
51
- # Skip proxies with too many recent failures
52
- if proxy_failure_count.get(proxy_str, 0) >= 3:
53
  continue
54
 
55
- return format_proxy_url(proxy_str)
 
 
 
 
56
 
57
- # If all proxies have failures, reset failure counts and try again
58
- proxy_failure_count.clear()
59
- logger.warning("All proxies have failure records, resetting counts")
60
- return get_next_proxy()
61
 
62
- def mark_proxy_failure(proxy_url: str):
63
- """Mark a proxy as having a failure"""
64
- # Extract the original proxy string from the URL
65
  for proxy_str in WEBSHARE_PROXIES:
66
- if proxy_str.split(':')[0] in proxy_url and proxy_str.split(':')[2] in proxy_url:
67
- proxy_failure_count[proxy_str] = proxy_failure_count.get(proxy_str, 0) + 1
68
- logger.warning(f"Marked proxy as failed: {proxy_url} (failure count: {proxy_failure_count[proxy_str]})")
69
-
70
- # Reset failure count after 5 minutes
71
- if proxy_failure_count[proxy_str] >= 3:
72
- logger.warning(f"Proxy {proxy_url} has failed multiple times, cooling down")
73
- asyncio.create_task(reset_proxy_failure(proxy_str, 300)) # 5 minutes cooldown
74
  break
75
 
76
- async def reset_proxy_failure(proxy_str: str, delay: int):
77
- """Reset the failure count for a proxy after a delay"""
78
  await asyncio.sleep(delay)
79
- if proxy_str in proxy_failure_count:
80
- proxy_failure_count[proxy_str] = 0
81
- logger.info(f"Reset failure count for proxy: {proxy_str}")
82
 
83
  async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
84
- """Fetch a page using WebShare proxies with retry logic"""
85
  logger.info(f"Requesting URL: {url}")
86
 
87
- # Try up to 3 different proxies
88
- max_proxy_attempts = 3
 
 
 
 
 
 
 
 
 
 
89
 
90
- for attempt in range(max_proxy_attempts):
91
- proxy_url = get_next_proxy()
92
- if not proxy_url:
93
- logger.error("Failed to get a valid proxy")
94
- return None
95
-
96
  logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
97
 
 
 
 
 
 
 
 
 
 
 
98
  try:
99
- # Try with this proxy
100
  async with session.get(
101
  url,
102
- headers=headers,
103
  proxy=proxy_url,
104
- timeout=30,
105
- ssl=False
 
106
  ) as response:
107
  if response.status in [200, 202]:
108
  content = await response.text()
109
 
110
- # Verify we got actual content (common anti-bot techniques return empty pages)
111
- if len(content) > 1000 and ("<html" in content or "<!DOCTYPE" in content):
112
- logger.info(f"Successfully retrieved content ({len(content)} bytes)")
113
- return content
114
- else:
115
- logger.warning(f"Response too small or not HTML: {len(content)} bytes")
116
- mark_proxy_failure(proxy_url)
 
 
 
 
 
117
  else:
118
- logger.warning(f"Response status code: {response.status}")
119
  mark_proxy_failure(proxy_url)
120
-
121
- except (aiohttp.ClientError, asyncio.TimeoutError) as e:
122
- logger.error(f"Request failed with proxy {proxy_url}: {str(e)}")
123
- mark_proxy_failure(proxy_url)
124
  except Exception as e:
125
- logger.error(f"Unexpected error: {str(e)}")
126
  mark_proxy_failure(proxy_url)
127
 
128
- # Wait before trying next proxy
129
- await asyncio.sleep(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
- logger.error("All proxy attempts failed")
132
  return None
 
2
  import logging
3
  import random
4
  import asyncio
5
+ from typing import Optional
 
 
6
 
7
+ # WebShare proxies (format IP:PORT:USERNAME:PASSWORD) — NOTE(review): credentials are hardcoded in source; move them to environment/config and rotate the exposed password
 
 
 
 
 
8
  WEBSHARE_PROXIES = [
9
  "198.23.239.134:6540:zvubytfw:ak6yit5k2tvj",
10
  "207.244.217.165:6712:zvubytfw:ak6yit5k2tvj",
 
18
  "154.36.110.199:6853:zvubytfw:ak6yit5k2tvj"
19
  ]
20
 
21
+ # Track proxy performance
22
+ proxy_failures = {}
23
+ current_proxy_idx = -1
24
 
25
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
26
 
27
def get_proxy_url() -> str:
    """Return the next usable proxy as an ``http://user:pass@ip:port`` URL.

    Proxies are handed out round-robin; any proxy with 3 or more recorded
    failures is skipped.  If every proxy is currently marked as failing,
    the failure counts are reset once and the rotation is retried.

    Returns:
        A formatted proxy URL string.

    Raises:
        RuntimeError: if no well-formed proxy entry exists at all
            (WEBSHARE_PROXIES is empty or every entry is malformed).
    """
    global current_proxy_idx

    # Two passes: the second one runs after clearing the failure counts.
    # This replaces the previous unbounded recursion, which looped forever
    # when no entry could ever be formatted (clearing failure counts does
    # not fix a malformed proxy string).
    for _pass in range(2):
        for _ in range(len(WEBSHARE_PROXIES)):
            current_proxy_idx = (current_proxy_idx + 1) % len(WEBSHARE_PROXIES)
            proxy_str = WEBSHARE_PROXIES[current_proxy_idx]

            # Skip frequently failing proxies.
            if proxy_failures.get(proxy_str, 0) >= 3:
                continue

            # Entries are "IP:PORT:USERNAME:PASSWORD"; ignore malformed ones.
            parts = proxy_str.split(':')
            if len(parts) == 4:
                ip, port, username, password = parts
                return f"http://{username}:{password}@{ip}:{port}"

        # Every proxy was skipped -- reset the counters and try once more.
        proxy_failures.clear()

    raise RuntimeError("No usable proxy could be built from WEBSHARE_PROXIES")
 
49
 
50
# Strong references to in-flight cooldown tasks: asyncio keeps only weak
# references to tasks, so without this set a scheduled reset task could be
# garbage-collected before it ever runs.
_reset_tasks: set = set()


def mark_proxy_failure(proxy_url: str) -> None:
    """Record one failure for the proxy behind *proxy_url*.

    The proxy is matched back to its WEBSHARE_PROXIES entry by IP address
    (the URL embeds credentials, so an exact string compare is not used).
    After 3 failures a background task is scheduled that clears the count
    again after a 5-minute cooldown.

    Args:
        proxy_url: the ``http://user:pass@ip:port`` URL that failed.
    """
    for proxy_str in WEBSHARE_PROXIES:
        # proxy_str is "IP:PORT:USER:PASS"; match on the IP portion.
        if proxy_str.split(':')[0] in proxy_url:
            proxy_failures[proxy_str] = proxy_failures.get(proxy_str, 0) + 1
            if proxy_failures[proxy_str] >= 3:
                try:
                    task = asyncio.create_task(
                        reset_proxy_after_delay(proxy_str, 300))
                except RuntimeError:
                    # No running event loop (called from sync context).
                    # get_proxy_url's reset pass will clear the count later.
                    pass
                else:
                    # Keep the task alive until done, then drop the reference.
                    _reset_tasks.add(task)
                    task.add_done_callback(_reset_tasks.discard)
            break
58
 
59
async def reset_proxy_after_delay(proxy_str: str, delay: int) -> None:
    """Sleep for *delay* seconds, then zero the failure count of *proxy_str*.

    Only entries already present in ``proxy_failures`` are touched; this
    never adds a new key to the dictionary.

    Args:
        proxy_str: the raw "IP:PORT:USER:PASS" entry to reset.
        delay: cooldown duration in seconds.
    """
    await asyncio.sleep(delay)
    # Zero the counter rather than deleting the key, matching the reads
    # done elsewhere via proxy_failures.get(..., 0).
    if proxy_failures.get(proxy_str) is not None:
        proxy_failures[proxy_str] = 0
 
64
 
65
async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Fetch *url* through rotating WebShare proxies, with a direct-connection
    fallback when every proxy attempt fails.

    Args:
        session: open aiohttp session used for all requests.
        url: page to retrieve (may be rewritten for known-problematic
            "hosteller" searches).
        headers: base request headers; browser-like headers are merged on top.

    Returns:
        The HTML body on success, or None when no attempt produced
        plausible content.
    """
    logger.info(f"Requesting URL: {url}")

    # Merge browser-like headers on top of the caller's to look less like a bot.
    enhanced_headers = headers.copy()
    enhanced_headers.update({
        "Accept-Encoding": "gzip, deflate, br",  # Explicitly specify brotli support
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Referer": "https://www.google.com/",
    })

    # Special adjustments for "hosteller" searches.  Hoisted out of the retry
    # loop: the rewrites are idempotent, so doing them once is enough.
    if "hosteller" in url.lower():
        enhanced_headers["Referer"] = "https://www.google.com/search?q=hosteller+hotels+india"
        if "old+manali" in url.lower():
            # Remove the location qualifier for better results.
            url = url.replace("old+manali", "manali")
        elif "narkanda" in url.lower():
            # Try a more general search for Narkanda hostels.
            url = url.replace("the+hosteller+narkanda", "hostels+in+narkanda")

    # Try up to 3 proxies.
    max_attempts = 3
    for attempt in range(max_attempts):
        proxy_url = get_proxy_url()
        logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")

        try:
            async with session.get(
                url,
                headers=enhanced_headers,
                proxy=proxy_url,
                # ClientTimeout object instead of a bare int: passing a
                # plain number for `timeout` is deprecated in aiohttp.
                timeout=aiohttp.ClientTimeout(total=25),
                # NOTE(review): ssl=False disables certificate verification;
                # confirm this is an accepted trade-off for these proxies.
                ssl=False,
                allow_redirects=True
            ) as response:
                if response.status in [200, 202]:
                    content = await response.text()

                    # Heuristic bot-detection check: real pages are large HTML.
                    if len(content) > 5000 and ("<html" in content or "<!DOCTYPE" in content):
                        if "searchresults" in url or "search" in url:
                            # Search pages must actually contain result cards.
                            if "property-card" in content or "sr_property_block" in content:
                                logger.info(f"Successfully retrieved search results ({len(content)} bytes)")
                                return content
                            logger.warning("No property cards found in search results")
                            # Likely a bot-detection page served through this
                            # proxy -- rotate it out (previously unmarked).
                            mark_proxy_failure(proxy_url)
                        else:
                            logger.info(f"Successfully retrieved content ({len(content)} bytes)")
                            return content
                    else:
                        # Tiny or non-HTML body: treat as a proxy failure so
                        # the bad exit node stops being reused (previously
                        # these responses were silently ignored).
                        logger.warning(f"Response too small or not HTML: {len(content)} bytes")
                        mark_proxy_failure(proxy_url)
                else:
                    logger.warning(f"Response status {response.status} from proxy {proxy_url}")
                    mark_proxy_failure(proxy_url)
        except Exception as e:
            logger.error(f"Proxy request failed: {str(e)}")
            mark_proxy_failure(proxy_url)

        # Wait before the next attempt -- but not after the last one, so the
        # direct fallback is not needlessly delayed.
        if attempt < max_attempts - 1:
            await asyncio.sleep(2)

    # If proxies failed, try a direct connection as a last resort.
    logger.warning("All proxies failed, trying direct connection")
    try:
        async with session.get(
            url,
            headers=enhanced_headers,
            timeout=aiohttp.ClientTimeout(total=15)
        ) as response:
            if response.status == 200:
                content = await response.text()
                if len(content) > 5000:
                    return content
    except Exception as e:
        logger.error(f"Direct request also failed: {str(e)}")

    return None