| import asyncio |
| import aiohttp |
| import logging |
| from typing import List, Dict, Any |
| from services.booking_service import BookingService |
| from models.requests import HotelQuery |
|
|
| logger = logging.getLogger(__name__) |
|
|
class HotelScraper:
    """Coordinates scraping of multiple hotels through :class:`BookingService`.

    Queries are processed in rate-limited batches over a single shared
    ``aiohttp`` session; each failure is captured as a per-query error dict
    rather than aborting the whole run.
    """

    def __init__(self):
        # One service instance is reused for every query.
        self.booking_service = BookingService()

    async def scrape_hotels(
        self,
        hotel_queries: List[HotelQuery],
        *,
        chunk_size: int = 1,
        delay: float = 2.0,
    ) -> List[Dict[str, Any]]:
        """Scrape hotels in batches and return one result dict per query.

        Args:
            hotel_queries: Queries to process; results preserve this order.
            chunk_size: Number of queries scraped concurrently per batch.
                Defaults to 1 (fully sequential — the historical behavior);
                raise it to actually overlap requests.
            delay: Seconds to sleep between batches to rate-limit the
                target site (skipped after the final batch).

        Returns:
            One dict per input query, in input order. When a query fails,
            its dict carries ``destination``/``hotel_name`` plus an
            ``"error"`` message instead of scraped data.

        Raises:
            ValueError: If ``chunk_size`` is less than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be >= 1")

        logger.info("Starting to scrape %d hotels", len(hotel_queries))

        # force_close + a small connection limit avoids hammering the
        # target with pooled keep-alive connections.
        connector = aiohttp.TCPConnector(
            limit=3,
            ttl_dns_cache=300,
            force_close=True,
            enable_cleanup_closed=True,
        )
        timeout = aiohttp.ClientTimeout(total=90)

        processed_results: List[Dict[str, Any]] = []

        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            for start in range(0, len(hotel_queries), chunk_size):
                chunk = hotel_queries[start:start + chunk_size]

                tasks = [
                    asyncio.create_task(self.booking_service.search_hotel(
                        session=session,
                        destination=query.destination,
                        hotel_name=query.hotel_name,
                    ))
                    for query in chunk
                ]

                # return_exceptions=True so one failed query cannot abort
                # the batch; exceptions are turned into error dicts below.
                chunk_results = await asyncio.gather(*tasks, return_exceptions=True)

                for query, result in zip(chunk, chunk_results):
                    if isinstance(result, Exception):
                        logger.error("Error scraping %s: %s", query.hotel_name, result)
                        processed_results.append({
                            "destination": query.destination,
                            "hotel_name": query.hotel_name,
                            "error": f"Scraping failed: {str(result)}",
                        })
                    else:
                        processed_results.append(result)

                # Pause between batches for politeness, but not after the last.
                if start + chunk_size < len(hotel_queries):
                    await asyncio.sleep(delay)

        return processed_results