garvitcpp committed on
Commit
5a17107
·
verified ·
1 Parent(s): 7e7b0a9

Update core/scraper.py

Browse files
Files changed (1) hide show
  1. core/scraper.py +36 -34
core/scraper.py CHANGED
@@ -1,5 +1,5 @@
1
  import asyncio
2
- import aiohttp # type: ignore
3
  import logging
4
  from typing import List, Dict, Any
5
  from services.booking_service import BookingService
@@ -17,50 +17,52 @@ class HotelScraper:
17
  """Scrape multiple hotels concurrently"""
18
  logger.info(f"Starting to scrape {len(hotel_queries)} hotels")
19
 
20
- # Use ClientSession with TCP connector options for better reliability
21
  connector = aiohttp.TCPConnector(
22
- limit=5, # Limit number of simultaneous connections
23
- ttl_dns_cache=300, # Cache DNS results for 5 minutes
24
- force_close=True, # Force close connections after each use
25
- ssl=False # Some proxies don't support SSL
26
  )
27
 
28
- timeout = aiohttp.ClientTimeout(total=60) # Longer timeout for proxies
29
 
30
  async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
31
- tasks = []
32
- for query in hotel_queries:
33
- task = asyncio.create_task(self.booking_service.search_hotel(
34
- session=session,
35
- destination=query.destination,
36
- hotel_name=query.hotel_name
37
- ))
38
- tasks.append(task)
39
-
40
  # Process hotels in small batches to avoid overloading proxies
41
  processed_results = []
42
- chunk_size = 2 # Process 2 hotels at a time
43
 
44
- for i in range(0, len(tasks), chunk_size):
45
- chunk = tasks[i:i+chunk_size]
46
- chunk_results = await asyncio.gather(*chunk, return_exceptions=True)
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
- # Process results
49
  for j, result in enumerate(chunk_results):
50
  query_index = i + j
51
- if query_index < len(hotel_queries):
52
- if isinstance(result, Exception):
53
- logger.error(f"Error scraping {hotel_queries[query_index].hotel_name}: {result}")
54
- processed_results.append({
55
- "destination": hotel_queries[query_index].destination,
56
- "hotel_name": hotel_queries[query_index].hotel_name,
57
- "error": f"Scraping failed: {str(result)}"
58
- })
59
- else:
60
- processed_results.append(result)
61
 
62
- # Add delay between chunks
63
- if i + chunk_size < len(tasks):
64
- await asyncio.sleep(2) # 2 second delay between batches
65
 
66
  return processed_results
 
1
  import asyncio
2
+ import aiohttp
3
  import logging
4
  from typing import List, Dict, Any
5
  from services.booking_service import BookingService
 
17
  """Scrape multiple hotels concurrently"""
18
  logger.info(f"Starting to scrape {len(hotel_queries)} hotels")
19
 
20
+ # Configure client session with optimal settings for proxy use
21
  connector = aiohttp.TCPConnector(
22
+ limit=3, # Limit concurrent connections to avoid proxy overload
23
+ ttl_dns_cache=300, # Cache DNS results
24
+ force_close=True, # Always close connections
25
+ enable_cleanup_closed=True, # Clean up closed connections
26
  )
27
 
28
+ timeout = aiohttp.ClientTimeout(total=90) # Longer timeout for proxies
29
 
30
  async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
 
 
 
 
 
 
 
 
 
31
  # Process hotels in small batches to avoid overloading proxies
32
  processed_results = []
33
+ chunk_size = 1 # Process one hotel at a time for better reliability
34
 
35
+ for i in range(0, len(hotel_queries), chunk_size):
36
+ chunk_queries = hotel_queries[i:i+chunk_size]
37
+
38
+ # Create tasks for this chunk
39
+ tasks = []
40
+ for query in chunk_queries:
41
+ task = asyncio.create_task(self.booking_service.search_hotel(
42
+ session=session,
43
+ destination=query.destination,
44
+ hotel_name=query.hotel_name
45
+ ))
46
+ tasks.append(task)
47
+
48
+ # Wait for all tasks in this chunk to complete
49
+ chunk_results = await asyncio.gather(*tasks, return_exceptions=True)
50
 
51
+ # Process results from this chunk
52
  for j, result in enumerate(chunk_results):
53
  query_index = i + j
54
+ if isinstance(result, Exception):
55
+ logger.error(f"Error scraping {hotel_queries[query_index].hotel_name}: {result}")
56
+ processed_results.append({
57
+ "destination": hotel_queries[query_index].destination,
58
+ "hotel_name": hotel_queries[query_index].hotel_name,
59
+ "error": f"Scraping failed: {str(result)}"
60
+ })
61
+ else:
62
+ processed_results.append(result)
 
63
 
64
+ # Add delay between chunks (important for proxy rotation)
65
+ if i + chunk_size < len(hotel_queries):
66
+ await asyncio.sleep(2)
67
 
68
  return processed_results