Update core/scraper.py
core/scraper.py  (+36 -17)
@@ -1,7 +1,6 @@
 import asyncio
-import aiohttp
+import aiohttp  # type: ignore
 import logging
-import random
 from typing import List, Dict, Any
 from services.booking_service import BookingService
 from models.requests import HotelQuery
@@ -18,7 +17,18 @@ class HotelScraper:
         """Scrape multiple hotels concurrently"""
         logger.info(f"Starting to scrape {len(hotel_queries)} hotels")
 
-
+        # Use ClientSession with TCP connector options for better reliability
+        connector = aiohttp.TCPConnector(
+            limit=5,                     # Limit number of simultaneous connections
+            ttl_dns_cache=300,           # Cache DNS results for 5 minutes
+            enable_cleanup_closed=True,  # Clean up closed connections
+            force_close=True,            # Force close connections
+            ssl=False                    # Some proxies don't support SSL
+        )
+
+        timeout = aiohttp.ClientTimeout(total=60)  # Longer timeout for proxies
+
+        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
             tasks = []
             for query in hotel_queries:
                 task = self.booking_service.search_hotel(
@@ -28,20 +38,29 @@ class HotelScraper:
                 )
                 tasks.append(task)
 
-        # Run all tasks concurrently
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-
-        # Handle any exceptions
+            # Run all tasks concurrently with a limit to avoid overloading proxies
             processed_results = []
+            chunk_size = 3  # Process 3 hotels at a time
+
+            for i in range(0, len(tasks), chunk_size):
+                chunk = tasks[i:i + chunk_size]
+                results = await asyncio.gather(*chunk, return_exceptions=True)
+
+                for j, result in enumerate(results):
+                    query_index = i + j
+                    if query_index < len(hotel_queries):  # Safety check
+                        if isinstance(result, Exception):
+                            logger.error(f"Error scraping hotel {hotel_queries[query_index].hotel_name}: {result}")
+                            processed_results.append({
+                                "destination": hotel_queries[query_index].destination,
+                                "hotel_name": hotel_queries[query_index].hotel_name,
+                                "error": f"Scraping failed: {str(result)}"
+                            })
+                        else:
+                            processed_results.append(result)
+
+                # Small delay between chunks to be gentle on proxies
+                if i + chunk_size < len(tasks):
+                    await asyncio.sleep(1)
 
         return processed_results
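
The commit's core pattern, chunked asyncio.gather with return_exceptions=True plus a pause between chunks, is reusable beyond this scraper. Below is a minimal, self-contained sketch of that pattern; gather_in_chunks and flaky_fetch are hypothetical names invented for illustration, not part of this repository, and the chunk size and delay simply mirror the values in the diff.

import asyncio
import random
from typing import Any, Awaitable, List

async def gather_in_chunks(
    tasks: List[Awaitable[Any]],
    chunk_size: int = 3,   # matches the diff's chunk_size
    delay: float = 1.0,    # matches the diff's asyncio.sleep(1)
) -> List[Any]:
    # Run awaitables a few at a time, pausing between chunks.
    # return_exceptions=True turns raised exceptions into return
    # values, so one failing task cannot abort the whole batch.
    results: List[Any] = []
    for i in range(0, len(tasks), chunk_size):
        chunk = tasks[i:i + chunk_size]
        results.extend(await asyncio.gather(*chunk, return_exceptions=True))
        if i + chunk_size < len(tasks):
            await asyncio.sleep(delay)  # be gentle on upstream services
    return results

async def flaky_fetch(n: int) -> str:
    # Hypothetical stand-in for booking_service.search_hotel.
    await asyncio.sleep(0.1)
    if random.random() < 0.3:
        raise RuntimeError(f"proxy error on request {n}")
    return f"result {n}"

async def main() -> None:
    tasks = [flaky_fetch(n) for n in range(8)]
    for result in await gather_in_chunks(tasks):
        if isinstance(result, Exception):
            print("failed:", result)
        else:
            print("ok:", result)

if __name__ == "__main__":
    asyncio.run(main())

One design note: a fixed chunk waits for its slowest task before the next chunk starts, so an asyncio.Semaphore around each call would keep a steady number of requests in flight instead. Chunking is the simpler choice here because it makes the inter-chunk sleep trivial, and when the bottleneck is proxy rate limits that predictability matters more than raw throughput.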