Update core/scraper.py
core/scraper.py  +36 -34  CHANGED
@@ -1,5 +1,5 @@
 import asyncio
-import aiohttp
+import aiohttp
 import logging
 from typing import List, Dict, Any
 from services.booking_service import BookingService
@@ -17,50 +17,52 @@ class HotelScraper:
         """Scrape multiple hotels concurrently"""
         logger.info(f"Starting to scrape {len(hotel_queries)} hotels")

-        #
+        # Configure client session with optimal settings for proxy use
         connector = aiohttp.TCPConnector(
-            limit=
-            ttl_dns_cache=300,  # Cache DNS results
-            force_close=True,
-
+            limit=3,  # Limit concurrent connections to avoid proxy overload
+            ttl_dns_cache=300,  # Cache DNS results
+            force_close=True,  # Always close connections
+            enable_cleanup_closed=True,  # Clean up closed connections
         )

-        timeout = aiohttp.ClientTimeout(total=
+        timeout = aiohttp.ClientTimeout(total=90)  # Longer timeout for proxies

         async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
-            tasks = []
-            for query in hotel_queries:
-                task = asyncio.create_task(self.booking_service.search_hotel(
-                    session=session,
-                    destination=query.destination,
-                    hotel_name=query.hotel_name
-                ))
-                tasks.append(task)
-
             # Process hotels in small batches to avoid overloading proxies
             processed_results = []
-            chunk_size =
+            chunk_size = 1  # Process one hotel at a time for better reliability

-            for i in range(0, len(
-
-
+            for i in range(0, len(hotel_queries), chunk_size):
+                chunk_queries = hotel_queries[i:i+chunk_size]
+
+                # Create tasks for this chunk
+                tasks = []
+                for query in chunk_queries:
+                    task = asyncio.create_task(self.booking_service.search_hotel(
+                        session=session,
+                        destination=query.destination,
+                        hotel_name=query.hotel_name
+                    ))
+                    tasks.append(task)
+
+                # Wait for all tasks in this chunk to complete
+                chunk_results = await asyncio.gather(*tasks, return_exceptions=True)

-                # Process results
+                # Process results from this chunk
                 for j, result in enumerate(chunk_results):
                     query_index = i + j
-                    if
-
-
-
-
-
-
-
-
-                    processed_results.append(result)
+                    if isinstance(result, Exception):
+                        logger.error(f"Error scraping {hotel_queries[query_index].hotel_name}: {result}")
+                        processed_results.append({
+                            "destination": hotel_queries[query_index].destination,
+                            "hotel_name": hotel_queries[query_index].hotel_name,
+                            "error": f"Scraping failed: {str(result)}"
+                        })
+                    else:
+                        processed_results.append(result)

-                # Add delay between chunks
-                if i + chunk_size < len(
-                    await asyncio.sleep(2)
+                # Add delay between chunks (important for proxy rotation)
+                if i + chunk_size < len(hotel_queries):
+                    await asyncio.sleep(2)

         return processed_results
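For reference, the batching pattern this change introduces (gather each small chunk with return_exceptions=True so one failed lookup cannot abort the run, then sleep before the next chunk) can be exercised on its own. The sketch below is a minimal, self-contained illustration, not code from this repository: fetch, scrape_in_chunks, and the sample hotel names are hypothetical stand-ins for BookingService.search_hotel and the real query objects.

import asyncio

async def fetch(name: str) -> dict:
    # Hypothetical stand-in for BookingService.search_hotel; one input fails on purpose
    await asyncio.sleep(0.1)
    if name == "bad-hotel":
        raise RuntimeError("blocked by proxy")
    return {"hotel_name": name, "price": 123}

async def scrape_in_chunks(names: list[str], chunk_size: int = 1, delay: float = 2.0) -> list[dict]:
    results: list[dict] = []
    for i in range(0, len(names), chunk_size):
        chunk = names[i:i + chunk_size]
        # return_exceptions=True turns failures into returned exception objects
        # instead of cancelling the remaining tasks in the chunk
        chunk_results = await asyncio.gather(*(fetch(n) for n in chunk), return_exceptions=True)
        for name, result in zip(chunk, chunk_results):
            if isinstance(result, Exception):
                results.append({"hotel_name": name, "error": str(result)})
            else:
                results.append(result)
        # Pause between chunks so a rotating proxy is not hit in bursts
        if i + chunk_size < len(names):
            await asyncio.sleep(delay)
    return results

if __name__ == "__main__":
    print(asyncio.run(scrape_in_chunks(["hotel-a", "bad-hotel", "hotel-b"], delay=0.2)))

Running the sketch prints one result dict per input, with the failing name reported as an error entry rather than raising, which mirrors how the updated scraper keeps partial results when a single hotel lookup fails.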