File size: 3,052 Bytes
730ee00
5a17107
730ee00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a17107
23f8688
5a17107
 
 
 
23f8688
 
5a17107
23f8688
 
737ef25
730ee00
5a17107
23f8688
5a17107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23f8688
5a17107
737ef25
23f8688
5a17107
 
 
 
 
 
 
 
 
23f8688
5a17107
 
 
730ee00
28df1e8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import asyncio
import aiohttp
import logging
from typing import List, Dict, Any
from services.booking_service import BookingService
from models.requests import HotelQuery

logger = logging.getLogger(__name__)

class HotelScraper:
    """Coordinates concurrent scraping of hotel data via BookingService.

    Hotels are processed in small sequential chunks over a single shared
    aiohttp session that is tuned for proxy use (low connection limit,
    DNS caching, forced connection close).
    """

    def __init__(self):
        # The per-hotel search itself is delegated to the booking service.
        self.booking_service = BookingService()

    async def scrape_hotels(self, hotel_queries: List[HotelQuery]) -> List[Dict[str, Any]]:
        """Scrape multiple hotels and return one result dict per query.

        Args:
            hotel_queries: Queries carrying ``destination`` and ``hotel_name``.

        Returns:
            One dict per input query, in input order. A failed query yields a
            dict with the query's ``destination``/``hotel_name`` plus an
            ``"error"`` message instead of raising, so one bad hotel never
            aborts the whole batch.
        """
        # Lazy %-style args: formatting is skipped when INFO is disabled.
        logger.info("Starting to scrape %d hotels", len(hotel_queries))

        # Nothing to do — avoid building a connector/session for an empty batch.
        if not hotel_queries:
            return []

        # Session settings tuned for scraping through rotating proxies:
        # few concurrent connections, cached DNS, and connections closed
        # after every request.
        connector = aiohttp.TCPConnector(
            limit=3,                      # avoid overloading the proxy
            ttl_dns_cache=300,            # cache DNS results for 5 minutes
            force_close=True,             # always close connections
            enable_cleanup_closed=True,   # reap closed connections promptly
        )
        timeout = aiohttp.ClientTimeout(total=90)  # generous total for slow proxies

        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            processed_results: List[Dict[str, Any]] = []
            chunk_size = 1  # one hotel at a time for proxy reliability

            for start in range(0, len(hotel_queries), chunk_size):
                chunk = hotel_queries[start:start + chunk_size]

                # Fan out this chunk; return_exceptions=True so one failure
                # does not cancel the remaining tasks in the chunk.
                tasks = [
                    asyncio.create_task(self.booking_service.search_hotel(
                        session=session,
                        destination=query.destination,
                        hotel_name=query.hotel_name,
                    ))
                    for query in chunk
                ]
                chunk_results = await asyncio.gather(*tasks, return_exceptions=True)

                # Pair each result with its originating query directly —
                # no index arithmetic back into hotel_queries.
                for query, result in zip(chunk, chunk_results):
                    if isinstance(result, Exception):
                        logger.error("Error scraping %s: %s", query.hotel_name, result)
                        processed_results.append({
                            "destination": query.destination,
                            "hotel_name": query.hotel_name,
                            "error": f"Scraping failed: {str(result)}"
                        })
                    else:
                        processed_results.append(result)

                # Pause between chunks (important for proxy rotation) —
                # but not after the final chunk.
                if start + chunk_size < len(hotel_queries):
                    await asyncio.sleep(2)

            return processed_results