| import re
|
| import logging
|
| import random
|
| from typing import Optional, Dict, Any, List
|
| from urllib.parse import urljoin, quote
|
|
|
# Configure logging once at import time: INFO and above, with each record
# showing the timestamp, logger name, level and message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
def get_clean_text(element) -> str:
    """Return the whitespace-stripped text of an HTML element.

    Args:
        element: Any object exposing a ``text`` attribute, or ``None``.

    Returns:
        The element's text with leading/trailing whitespace removed, or
        ``""`` when the element is ``None``, has no ``text`` attribute, or
        its text is empty/``None``.
    """
    # Compare against None explicitly: some element types evaluate as
    # falsy when they have no children even though they carry text.
    if element is None:
        return ""

    text = getattr(element, "text", None)
    # A missing or None text is treated as empty rather than crashing on
    # ``None.strip()``.
    if not text:
        return ""
    return text.strip()
|
|
|
def clean_url(base_url: str, href: str) -> str:
    """Resolve *href* against *base_url*.

    Returns an empty string when *href* is empty or ``None``; otherwise
    the result of ``urljoin(base_url, href)``.
    """
    return urljoin(base_url, href) if href else ""
|
|
|
def extract_float_from_text(text: str, default: Optional[float] = None) -> Optional[float]:
    """Parse the first number found in *text* as a float.

    The first run of digits, optionally followed by one ``.`` or ``,`` and
    more digits, is used; a comma is treated as the decimal separator.

    Args:
        text: Text to search. Empty or ``None`` yields *default*.
        default: Value returned when no number can be extracted.

    Returns:
        The parsed float, or *default* when nothing matches.
    """
    if not text:
        return default

    found = re.search(r'(\d+[\.,]?\d*)', text)
    if found is None:
        return default

    # Normalise a decimal comma ("8,5") to a dot before converting.
    number = found.group(1).replace(',', '.')
    try:
        return float(number)
    except ValueError:
        return default
|
|
|
def construct_booking_search_url(destination: str, hotel_name: Optional[str] = None) -> str:
    """Build a Booking.com search URL for a destination.

    Args:
        destination: Place to search for.
        hotel_name: Optional hotel name; when given it is placed before
            the destination in the search string.

    Returns:
        The search URL with the percent-encoded query in the ``ss``
        parameter.
    """
    # Start from the bare destination and prepend the hotel name if any.
    search_query = destination
    if hotel_name:
        search_query = f"{hotel_name} {destination}"

    return f"https://www.booking.com/search.html?ss={quote(search_query)}"
|
|
|
def is_valid_image_url(url: str) -> bool:
    """Check whether *url* is likely a room image rather than a logo.

    All checks are case-insensitive, so ``Logo.PNG`` is rejected and an
    ``HTTPS://`` scheme is accepted.

    Args:
        url: Candidate image URL; empty or ``None`` is rejected.

    Returns:
        ``True`` when the URL is an absolute http(s) URL that is not a
        ``data:`` URI and contains none of the markers ``icon``, ``logo``,
        ``badge`` or ``thumb``; ``False`` otherwise.
    """
    if not url:
        return False

    # Compare on a lowercased copy: URI schemes are case-insensitive and
    # asset names such as "Logo.png" must not slip past the filter.
    lowered = url.lower()

    # Inline data URIs are never usable as remote image links.
    if lowered.startswith("data:"):
        return False

    # Substring markers that indicate a non-room asset.
    excluded_markers = ("icon", "logo", "badge", "thumb")
    if any(marker in lowered for marker in excluded_markers):
        return False

    # Only absolute http(s) URLs are accepted.
    return lowered.startswith(("http://", "https://"))