garvitcpp committed on
Commit
23f8688
·
verified ·
1 Parent(s): 675dcd6

Update core/scraper.py

Browse files
Files changed (1) hide show
  1. core/scraper.py +36 -17
core/scraper.py CHANGED
@@ -1,7 +1,6 @@
1
  import asyncio
2
- import aiohttp # type: ignore
3
  import logging
4
- import random
5
  from typing import List, Dict, Any
6
  from services.booking_service import BookingService
7
  from models.requests import HotelQuery
@@ -18,7 +17,18 @@ class HotelScraper:
18
  """Scrape multiple hotels concurrently"""
19
  logger.info(f"Starting to scrape {len(hotel_queries)} hotels")
20
 
21
- async with aiohttp.ClientSession() as session:
 
 
 
 
 
 
 
 
 
 
 
22
  tasks = []
23
  for query in hotel_queries:
24
  task = self.booking_service.search_hotel(
@@ -28,20 +38,29 @@ class HotelScraper:
28
  )
29
  tasks.append(task)
30
 
31
- # Run all tasks concurrently
32
- results = await asyncio.gather(*tasks, return_exceptions=True)
33
-
34
- # Handle any exceptions
35
  processed_results = []
36
- for i, result in enumerate(results):
37
- if isinstance(result, Exception):
38
- logger.error(f"Error scraping hotel {hotel_queries[i].hotel_name}: {result}")
39
- processed_results.append({
40
- "destination": hotel_queries[i].destination,
41
- "hotel_name": hotel_queries[i].hotel_name,
42
- "error": f"Scraping failed: {str(result)}"
43
- })
44
- else:
45
- processed_results.append(result)
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  return processed_results
 
1
  import asyncio
2
+ import aiohttp # type: ignore
3
  import logging
 
4
  from typing import List, Dict, Any
5
  from services.booking_service import BookingService
6
  from models.requests import HotelQuery
 
17
  """Scrape multiple hotels concurrently"""
18
  logger.info(f"Starting to scrape {len(hotel_queries)} hotels")
19
 
20
+ # Use ClientSession with TCP connector options for better reliability
21
+ connector = aiohttp.TCPConnector(
22
+ limit=5, # Limit number of simultaneous connections
23
+ ttl_dns_cache=300, # Cache DNS results for 5 minutes
24
+ enable_cleanup_closed=True, # Clean up closed connections
25
+ force_close=True, # Force close connections
26
+ ssl=False # Some proxies don't support SSL
27
+ )
28
+
29
+ timeout = aiohttp.ClientTimeout(total=60) # Longer timeout for proxies
30
+
31
+ async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
32
  tasks = []
33
  for query in hotel_queries:
34
  task = self.booking_service.search_hotel(
 
38
  )
39
  tasks.append(task)
40
 
41
+ # Run all tasks concurrently with a limit to avoid overloading proxies
 
 
 
42
  processed_results = []
43
+ chunk_size = 3 # Process 3 hotels at a time
44
+
45
+ for i in range(0, len(tasks), chunk_size):
46
+ chunk = tasks[i:i+chunk_size]
47
+ results = await asyncio.gather(*chunk, return_exceptions=True)
48
+
49
+ for j, result in enumerate(results):
50
+ query_index = i + j
51
+ if query_index < len(hotel_queries): # Safety check
52
+ if isinstance(result, Exception):
53
+ logger.error(f"Error scraping hotel {hotel_queries[query_index].hotel_name}: {result}")
54
+ processed_results.append({
55
+ "destination": hotel_queries[query_index].destination,
56
+ "hotel_name": hotel_queries[query_index].hotel_name,
57
+ "error": f"Scraping failed: {str(result)}"
58
+ })
59
+ else:
60
+ processed_results.append(result)
61
+
62
+ # Small delay between chunks to be gentle on proxies
63
+ if i + chunk_size < len(tasks):
64
+ await asyncio.sleep(1)
65
 
66
  return processed_results