garvitcpp committed on
Commit
737ef25
·
verified ·
1 Parent(s): 999991a

Update core/scraper.py

Browse files
Files changed (1) hide show
  1. core/scraper.py +14 -14
core/scraper.py CHANGED
@@ -19,11 +19,10 @@ class HotelScraper:
19
 
20
  # Use ClientSession with TCP connector options for better reliability
21
  connector = aiohttp.TCPConnector(
22
- limit=5, # Limit number of simultaneous connections
23
  ttl_dns_cache=300, # Cache DNS results for 5 minutes
24
- enable_cleanup_closed=True, # Clean up closed connections
25
- force_close=True, # Force close connections
26
- ssl=False # Some proxies don't support SSL
27
  )
28
 
29
  timeout = aiohttp.ClientTimeout(total=60) # Longer timeout for proxies
@@ -31,26 +30,27 @@ class HotelScraper:
31
  async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
32
  tasks = []
33
  for query in hotel_queries:
34
- task = self.booking_service.search_hotel(
35
  session=session,
36
  destination=query.destination,
37
  hotel_name=query.hotel_name
38
- )
39
  tasks.append(task)
40
 
41
- # Run all tasks concurrently with a limit to avoid overloading proxies
42
  processed_results = []
43
- chunk_size = 3 # Process 3 hotels at a time
44
 
45
  for i in range(0, len(tasks), chunk_size):
46
  chunk = tasks[i:i+chunk_size]
47
- results = await asyncio.gather(*chunk, return_exceptions=True)
48
 
49
- for j, result in enumerate(results):
 
50
  query_index = i + j
51
- if query_index < len(hotel_queries): # Safety check
52
  if isinstance(result, Exception):
53
- logger.error(f"Error scraping hotel {hotel_queries[query_index].hotel_name}: {result}")
54
  processed_results.append({
55
  "destination": hotel_queries[query_index].destination,
56
  "hotel_name": hotel_queries[query_index].hotel_name,
@@ -59,8 +59,8 @@ class HotelScraper:
59
  else:
60
  processed_results.append(result)
61
 
62
- # Small delay between chunks to be gentle on proxies
63
  if i + chunk_size < len(tasks):
64
- await asyncio.sleep(1)
65
 
66
  return processed_results
 
19
 
20
  # Use ClientSession with TCP connector options for better reliability
21
  connector = aiohttp.TCPConnector(
22
+ limit=5, # Limit number of simultaneous connections
23
  ttl_dns_cache=300, # Cache DNS results for 5 minutes
24
+ force_close=True, # Force close connections after each use
25
+ ssl=False # Some proxies don't support SSL
 
26
  )
27
 
28
  timeout = aiohttp.ClientTimeout(total=60) # Longer timeout for proxies
 
30
  async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
31
  tasks = []
32
  for query in hotel_queries:
33
+ task = asyncio.create_task(self.booking_service.search_hotel(
34
  session=session,
35
  destination=query.destination,
36
  hotel_name=query.hotel_name
37
+ ))
38
  tasks.append(task)
39
 
40
+ # Process hotels in small batches to avoid overloading proxies
41
  processed_results = []
42
+ chunk_size = 2 # Process 2 hotels at a time
43
 
44
  for i in range(0, len(tasks), chunk_size):
45
  chunk = tasks[i:i+chunk_size]
46
+ chunk_results = await asyncio.gather(*chunk, return_exceptions=True)
47
 
48
+ # Process results
49
+ for j, result in enumerate(chunk_results):
50
  query_index = i + j
51
+ if query_index < len(hotel_queries):
52
  if isinstance(result, Exception):
53
+ logger.error(f"Error scraping {hotel_queries[query_index].hotel_name}: {result}")
54
  processed_results.append({
55
  "destination": hotel_queries[query_index].destination,
56
  "hotel_name": hotel_queries[query_index].hotel_name,
 
59
  else:
60
  processed_results.append(result)
61
 
62
+ # Add delay between chunks
63
  if i + chunk_size < len(tasks):
64
+ await asyncio.sleep(2) # 2 second delay between batches
65
 
66
  return processed_results