Spaces:

garvitcpp
/

accomodation-info-api

Paused

File size: 15,175 Bytes

28df1e8

import asyncio
import random
import re
import aiohttp # type: ignore
from bs4 import BeautifulSoup # type: ignore
from typing import Dict, Any, List, Optional, Tuple
import logging
from urllib.parse import urljoin, quote
from dotenv import load_dotenv
import os

from .utils.http_utils import fetch_page
from .utils.image_utils import filter_logo_images, is_logo_image
from .utils.html_utils import extract_rating_from_element, extract_images_from_soup
from .utils.google_search_utils import fetch_hotel_images_from_google

# Load environment variables
load_dotenv()

logger = logging.getLogger(__name__)

class BookingService:
    """Service for scraping hotel data from Booking.com"""
    
    def __init__(self):
        """Set up the user-agent pool, base request headers and fallback flag.

        Attributes set:
            user_agents: Browser User-Agent strings; ``get_page`` picks one at
                random for every request.
            headers: Base request headers shared by all requests. They carry
                no ``User-Agent`` entry; ``get_page`` adds one to a
                per-request copy.
            google_api_available: ``True`` only when both
                ``GOOGLE_SEARCH_API_KEY`` and ``GOOGLE_SEARCH_ENGINE_ID`` are
                present and non-empty in the environment.
        """
        # List of diverse user agents for rotation
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPad; CPU OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/123.0.0.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
        ]
        
        # Base headers - User-Agent will be overridden in get_page
        self.headers = {
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "sec-ch-ua": '"Google Chrome";v="123", "Not:A-Brand";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
        }
        
        # Check if Google Search API credentials are available
        self.google_api_available = bool(os.getenv("GOOGLE_SEARCH_API_KEY") and os.getenv("GOOGLE_SEARCH_ENGINE_ID"))
        
        # The logging framework stamps each record with the real time; a
        # hard-coded date/author in the message would be wrong on every run.
        logger.info("BookingService initialized")
        if self.google_api_available:
            logger.info("Google Custom Search API configured as fallback for hotel images")
        else:
            logger.warning("Google Custom Search API credentials not found - fallback will not be available")

    # Page fetching, detail-page scraping and search methods
    
    async def get_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
        """Fetch ``url`` through ``fetch_page`` with a randomly chosen User-Agent.

        The shared base headers are never mutated: every call builds its own
        header dict with the picked User-Agent added.

        Args:
            session: aiohttp client session handed on to ``fetch_page``.
            url: Address of the page to download.

        Returns:
            Whatever ``fetch_page`` returns for this request.
        """
        chosen_agent = random.choice(self.user_agents)
        request_headers = {**self.headers, "User-Agent": chosen_agent}

        # Only a short prefix is logged to keep debug output compact.
        logger.debug(f"Using user agent: {chosen_agent[:30]}...")
        return await fetch_page(session, url, request_headers)
    
    async def extract_amenities(self, session: aiohttp.ClientSession, hotel_element, hotel_url: Optional[str] = None) -> List[str]:
        """Extract popular facilities from hotel detail page"""
        unique_amenities = set()
        
        if hotel_url:
            try:
                html = await self.get_page(session, hotel_url)
                if html:
                    soup = BeautifulSoup(html, 'html.parser')
                    
                    popular_heading = soup.find(string=lambda text: text and text.strip() == "Most popular facilities")
                    
                    if popular_heading:
                        current = popular_heading.parent
                        container = None
                        
                        # Look for container with facility icons
                        for _ in range(3):
                            if not current:
                                break
                                
                            if current.select("svg") or current.select("img"):
                                container = current
                                break
                                
                            parent = current.parent
                            if parent and (parent.select("svg") or parent.select("img")):
                                container = parent
                                break
                            
                            sibling = current.find_next_sibling()
                            if sibling and (sibling.select("svg") or sibling.select("img")):
                                container = sibling
                                break
                                
                            current = parent
                        
                        if not container:
                            heading_parent = popular_heading.parent
                            if heading_parent:
                                container = heading_parent.find_next_sibling()
                        
                        # Extract facility items
                        if container:
                            facility_items = container.select("span") or container.select("div")
                            
                            for item in facility_items:
                                text = item.get_text().strip()
                                if text and text != "Most popular facilities" and len(text) < 30:
                                    unique_amenities.add(text)
                        
                        # Fallback method
                        if not unique_amenities:
                            try:
                                rows = soup.select(".f6b6d2a959") or soup.select_one("div:-soup-contains('Most popular facilities')").parent.find_next_sibling().select("span")
                                
                                for item in rows:
                                    text = item.get_text().strip()
                                    if text and text != "Most popular facilities" and len(text) < 30:
                                        unique_amenities.add(text)
                            except AttributeError:
                                logger.debug("Could not find facilities using fallback selector")
            except Exception as e:
                logger.error(f"Error extracting amenities: {e}")
        
        return list(unique_amenities)
    
    async def get_room_images_from_detail_page(self, session: aiohttp.ClientSession, url: str) -> List[str]:
        """Get a mix of property and room images from hotel detail page"""
        all_images = []
        
        try:
            html = await self.get_page(session, url)
            if html:
                soup = BeautifulSoup(html, 'html.parser')
                
                selectors = [
                    ".bui-carousel__item img", ".bh-photo-grid img",
                    ".hp-gallery img", ".hotel-photos img",
                    ".room-gallery img", ".hotel-room-photographs-slides img",
                    "img.active-image", ".gallery-mosaic img", ".tour-360__image img",
                    "img[width='300'], img[width='350'], img[width='400'], img[width='500']",
                ]
                
                all_images = extract_images_from_soup(soup, url, selectors)
                
                if len(all_images) < 5:
                    for img in soup.select("img"):
                        width = img.get("width")
                        if width and int(width) < 100:
                            continue
                            
                        src = img.get("src") or img.get("data-src")
                        if src and not is_logo_image(src) and src not in all_images:
                            if not src.startswith("http"):
                                src = urljoin(url, src)
                            all_images.append(src)
                            if len(all_images) >= 5:
                                break
                
                return filter_logo_images(all_images)[:5]
                
        except Exception as e:
            logger.error(f"Error getting hotel images: {e}", exc_info=True)
            
        return all_images[:5] if all_images else []
    
    async def extract_rating_from_detail_page(self, session: aiohttp.ClientSession, url: str) -> Optional[float]:
        """Extract rating from hotel detail page"""
        try:
            html = await self.get_page(session, url)
            if not html:
                return None
            
            soup = BeautifulSoup(html, 'html.parser')
            
            guest_reviews_section = soup.find("h2", string="Guest reviews")
            if guest_reviews_section:
                rating_div = soup.select_one("div[aria-label*='Scored'] strong") or soup.select_one(".b5cd09854e") 
                if rating_div:
                    text = rating_div.get_text().strip()
                    match = re.search(r"(\d+[.,]\d+)", text)
                    if match:
                        return float(match.group(1).replace(',', '.'))
                
                nearby_elements = guest_reviews_section.parent.select("div")
                for elem in nearby_elements:
                    text = elem.get_text().strip()
                    if re.match(r"^\d+[.,]\d+$", text):
                        return float(text.replace(',', '.'))
            
            score_elements = soup.select(".review-score-badge, .b5cd09854e")
            for elem in score_elements:
                text = elem.get_text().strip()
                match = re.search(r"(\d+[.,]\d+)", text)
                if match:
                    return float(match.group(1).replace(',', '.'))
            
            review_text = soup.find(string=lambda text: text and ("Review score" in text))
            if review_text:
                parent_text = review_text.parent.get_text() if review_text.parent else ""
                match = re.search(r"(\d+[.,]\d+)", parent_text)
                if match:
                    return float(match.group(1).replace(',', '.'))
                    
        except Exception as e:
            logger.error(f"Error extracting rating: {e}")
        
        return None
    
    def extract_rating(self, hotel_element) -> Optional[float]:
        """Read the rating from a search-result hotel element.

        Thin delegate to ``extract_rating_from_element``; its result is
        returned unchanged.
        """
        card_rating = extract_rating_from_element(hotel_element)
        return card_rating
    
    def is_name_similar(self, name1: str, name2: str) -> bool:
        """Check if two hotel names are similar enough"""
        if not name1 or not name2:
            return False
            
        name1 = name1.lower()
        name2 = name2.lower()
        
        if name1 in name2 or name2 in name1:
            return True
        
        # Compare words
        words1 = set(re.findall(r'\w+', name1))
        words2 = set(re.findall(r'\w+', name2))
        
        if not words1 or not words2:
            return False
        
        # Calculate word overlap
        common_words = words1.intersection(words2)
        similarity = len(common_words) / min(len(words1), len(words2))
        
        return similarity >= 0.5  # 50% word overlap
    
    async def search_hotel(self, session: aiohttp.ClientSession, destination: str, hotel_name: str) -> Dict[str, Any]:
        """Search for a specific hotel on Booking.com"""
        search_query = f"{hotel_name} {destination}"
        search_url = f"https://www.booking.com/search.html?ss={quote(search_query)}"
        
        html = await self.get_page(session, search_url)
        
        if not html:
            return {
                "destination": destination,
                "hotel_name": hotel_name,
                "error": "Failed to retrieve search results"
            }
        
        soup = BeautifulSoup(html, 'html.parser')
        hotel_cards = soup.select("[data-testid='property-card'], .sr_property_block, .sr_item")
        
        if not hotel_cards:
            return {
                "destination": destination,
                "hotel_name": hotel_name,
                "error": "No hotels found"
            }
        
        # Find matching hotel card
        hotel_card = None
        for card in hotel_cards:
            name_elem = card.select_one("[data-testid='title'], .sr-hotel__name, .hotel_name")
            if name_elem:
                card_hotel_name = name_elem.text.strip()
                if self.is_name_similar(card_hotel_name, hotel_name):
                    hotel_card = card
                    break
        
        if not hotel_card:
            hotel_card = hotel_cards[0]
        
        name_elem = hotel_card.select_one("[data-testid='title'], .sr-hotel__name, .hotel_name")
        name = name_elem.text.strip() if name_elem else hotel_name
        rating = self.extract_rating(hotel_card)

        link_elem = hotel_card.select_one("a[href*='hotel'], a.hotel_name_link")
        hotel_url = ""
        if link_elem and 'href' in link_elem.attrs:
            href = link_elem['href']
            hotel_url = urljoin("https://www.booking.com", href) if not href.startswith('http') else href

        if hotel_url:
            tasks = [
                self.extract_rating_from_detail_page(session, hotel_url),
                self.get_room_images_from_detail_page(session, hotel_url),
                self.extract_amenities(session, hotel_card, hotel_url)
            ]
            
            detail_rating, images, amenities = await asyncio.gather(*tasks)
            
            if detail_rating is not None:
                rating = detail_rating
        else:
            images = []
            amenities = []
            
        # If scraping didn't return any images, use Google Custom Search API as fallback
        if not images and self.google_api_available:
            logger.info(f"No images found via scraping for {hotel_name} in {destination}. Using Google API as fallback.")
            images = await fetch_hotel_images_from_google(session, hotel_name, destination)
        
        return {
            "destination": destination,
            "hotel_name": hotel_name,
            "data": {
                "name": name,
                "rating": rating,
                "images": images,
                "amenities": amenities,
                "booking_link": hotel_url
            }
        }