import asyncio
import logging
from typing import Any, Dict, List

import aiohttp

from models.requests import HotelQuery
from services.booking_service import BookingService

logger = logging.getLogger(__name__)


class HotelScraper:
    """Main scraper class that coordinates the scraping process.

    Delegates each individual hotel lookup to ``BookingService`` and manages
    session setup, chunked concurrency, error collection, and inter-chunk
    pacing (important when requests go through rotating proxies).
    """

    def __init__(self) -> None:
        self.booking_service = BookingService()

    async def scrape_hotels(self, hotel_queries: List[HotelQuery]) -> List[Dict[str, Any]]:
        """Scrape multiple hotels concurrently.

        Args:
            hotel_queries: Queries to resolve, each carrying a destination
                and a hotel name.

        Returns:
            One dict per query, in query order. On failure the dict carries
            the original ``destination``/``hotel_name`` plus an ``"error"``
            message; on success it is whatever ``search_hotel`` returned.
        """
        # Lazy %-style args: the message is only formatted if the level is enabled.
        logger.info("Starting to scrape %d hotels", len(hotel_queries))

        # Conservative connector settings for proxy use: few parallel sockets,
        # no connection reuse (a reused socket could pin requests to a dead
        # proxy exit), and cached DNS to cut per-request latency.
        connector = aiohttp.TCPConnector(
            limit=3,                     # limit concurrent connections to avoid proxy overload
            ttl_dns_cache=300,           # cache DNS results
            force_close=True,            # always close connections
            enable_cleanup_closed=True,  # clean up closed connections
        )
        timeout = aiohttp.ClientTimeout(total=90)  # longer timeout for proxies

        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            processed_results: List[Dict[str, Any]] = []
            chunk_size = 1  # process one hotel at a time for better reliability

            for start in range(0, len(hotel_queries), chunk_size):
                chunk = hotel_queries[start:start + chunk_size]

                # Fan out one task per query in this chunk.
                tasks = [
                    asyncio.create_task(
                        self.booking_service.search_hotel(
                            session=session,
                            destination=query.destination,
                            hotel_name=query.hotel_name,
                        )
                    )
                    for query in chunk
                ]

                # return_exceptions=True: a single failed hotel must not abort
                # the whole run — failures come back as exception objects.
                chunk_results = await asyncio.gather(*tasks, return_exceptions=True)

                # zip keeps each result paired with its originating query,
                # avoiding manual index arithmetic into hotel_queries.
                for query, result in zip(chunk, chunk_results):
                    if isinstance(result, Exception):
                        logger.error("Error scraping %s: %s", query.hotel_name, result)
                        processed_results.append({
                            "destination": query.destination,
                            "hotel_name": query.hotel_name,
                            "error": f"Scraping failed: {str(result)}",
                        })
                    else:
                        processed_results.append(result)

                # Delay between chunks (important for proxy rotation); skip
                # the pause after the final chunk.
                if start + chunk_size < len(hotel_queries):
                    await asyncio.sleep(2)

            return processed_results