import re
import logging
import random
from typing import Optional, Dict, Any, List
from urllib.parse import urljoin, quote

# Configures the root logger as a side effect of importing this module.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# First run of digits, optionally followed by one '.' or ',' and more digits.
_FLOAT_PATTERN = re.compile(r'(\d+[\.,]?\d*)')

# Substrings that mark a URL as an icon/logo-style asset rather than a photo.
_EXCLUDED_IMAGE_MARKERS = ("icon", "logo", "badge", "thumb")


def get_clean_text(element: Any) -> str:
    """Return the whitespace-stripped ``.text`` of an HTML element.

    Args:
        element: Any object exposing a ``text`` attribute, or ``None``.

    Returns:
        The stripped text, or ``""`` when ``element`` is ``None``, has no
        ``text`` attribute, or its ``text`` is not a string.
    """
    # Compare against None explicitly: some element types (for example
    # xml.etree.ElementTree elements without children) are falsy even though
    # they carry text, so a plain truthiness test would drop their content.
    if element is None:
        return ""
    text = getattr(element, "text", None)
    if not isinstance(text, str):
        return ""
    return text.strip()


def clean_url(base_url: str, href: str) -> str:
    """Resolve ``href`` against ``base_url``.

    Args:
        base_url: URL that relative links are resolved against.
        href: Absolute or relative link; may be empty or ``None``.

    Returns:
        The joined URL, or ``""`` when ``href`` is empty.
    """
    if not href:
        return ""
    return urljoin(base_url, href)


def extract_float_from_text(text: str, default: Optional[float] = None) -> Optional[float]:
    """Extract the first number found in ``text`` as a float.

    A comma is treated as a decimal separator, so ``"8,5"`` yields ``8.5``.
    Thousands separators are not recognised: ``"1,234.56"`` yields ``1.234``.
    Signs and leading decimal points are ignored (``"-.5"`` yields ``5.0``).

    Args:
        text: Text to search; may be empty or ``None``.
        default: Value returned when no number can be extracted.

    Returns:
        The parsed float, or ``default`` when ``text`` is empty or contains
        no digits.
    """
    if not text:
        return default
    match = _FLOAT_PATTERN.search(text)
    if match is None:
        return default
    try:
        return float(match.group(1).replace(',', '.'))
    except ValueError:
        # Defensive: fall back to the caller's default rather than raising.
        return default


def construct_booking_search_url(destination: str, hotel_name: Optional[str] = None) -> str:
    """Build a Booking.com search URL for a destination.

    Args:
        destination: Place to search for.
        hotel_name: Optional hotel name; when given it is placed before the
            destination in the search query.

    Returns:
        The search URL with the query percent-encoded by
        ``urllib.parse.quote`` (spaces become ``%20``; ``/`` is left as is).
    """
    search_query = f"{hotel_name} {destination}" if hotel_name else destination
    return f"https://www.booking.com/search.html?ss={quote(search_query)}"


def is_valid_image_url(url: str) -> bool:
    """Heuristically decide whether ``url`` points at a usable photo.

    A URL is accepted when it is an absolute ``http``/``https`` URL and does
    not contain any of the markers ``icon``, ``logo``, ``badge`` or ``thumb``.
    Both checks are case-insensitive. ``data:`` URIs and relative paths are
    rejected because they do not start with an http(s) scheme.

    Args:
        url: Candidate image URL; may be empty or ``None``.

    Returns:
        ``True`` when the URL passes both checks, otherwise ``False``.
    """
    if not url:
        return False
    # URL schemes are case-insensitive, and asset names such as "Logo.png"
    # should be filtered too, so compare on a lowercased copy.
    lowered = url.lower()
    if not lowered.startswith(("http://", "https://")):
        return False
    return not any(marker in lowered for marker in _EXCLUDED_IMAGE_MARKERS)