# garvitcpp's picture
# Upload 27 files
# 28df1e8 verified
# raw
# history blame
# 1.79 kB
import re
import logging
import random
from typing import Optional, Dict, Any, List
from urllib.parse import urljoin, quote
# Root logging is configured at import time: INFO level, with timestamp,
# logger name, level and message in every record.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def get_clean_text(element) -> str:
    """Return the whitespace-stripped text of an HTML element.

    Args:
        element: An object exposing a ``text`` attribute (for example a
            parsed HTML node), or ``None``.

    Returns:
        The element's text with leading/trailing whitespace removed, or an
        empty string when ``element`` is ``None``, has no ``text``
        attribute, or its text is ``None``/empty.
    """
    if element is None:
        return ""
    # Test against None instead of relying on truthiness: some element types
    # (e.g. xml.etree.ElementTree elements without children) evaluate as
    # falsy even though they carry text.
    text = getattr(element, "text", None)
    if not text:
        # Covers a missing attribute, ``None`` text and empty text.
        return ""
    return text.strip()
def clean_url(base_url: str, href: str) -> str:
    """Resolve ``href`` against ``base_url``.

    Returns an empty string when ``href`` is empty or ``None``; otherwise
    the result of ``urljoin(base_url, href)``.
    """
    return urljoin(base_url, href) if href else ""
def extract_float_from_text(text: str, default: Optional[float] = None) -> Optional[float]:
    """Parse the first number found in ``text`` as a float.

    The first run of digits, optionally followed by a single ``.`` or ``,``
    and further digits, is used; a comma is treated as a decimal separator.

    Args:
        text: Text to search. Empty or ``None`` yields ``default``.
        default: Value returned when no number can be extracted.

    Returns:
        The parsed float, or ``default`` when nothing matches.
    """
    found = re.search(r'(\d+[\.,]?\d*)', text) if text else None
    if found is None:
        return default

    # Normalise a decimal comma ("8,5") to a dot before conversion.
    number = found.group(1).replace(',', '.')
    try:
        return float(number)
    except ValueError:
        return default
def construct_booking_search_url(destination: str, hotel_name: Optional[str] = None) -> str:
    """Build a Booking.com search URL for a destination.

    When ``hotel_name`` is given (and non-empty) it is placed before the
    destination in the search terms. The terms are percent-encoded with
    ``urllib.parse.quote`` and used as the ``ss`` query parameter.
    """
    terms = destination
    if hotel_name:
        terms = f"{hotel_name} {destination}"
    return "https://www.booking.com/search.html?ss=" + quote(terms)
def is_valid_image_url(url: str) -> bool:
    """Check whether ``url`` looks like a usable room image rather than a logo.

    A URL is accepted only if it is an absolute ``http://`` or ``https://``
    URL and does not contain any of the markers ``icon``, ``logo``,
    ``badge`` or ``thumb``. All comparisons are case-insensitive, so
    ``.../Logo.png`` is rejected and ``HTTPS://...`` is accepted.

    Args:
        url: Candidate image URL; empty or ``None`` is rejected.

    Returns:
        ``True`` if the URL passes all checks, ``False`` otherwise.
    """
    if not url:
        return False

    # Schemes are case-insensitive and asset names are often capitalised,
    # so every check runs against a lower-cased copy.
    lowered = url.lower()

    # Must be a full http(s) URL; this also rules out inline "data:" URIs
    # and relative paths.
    if not lowered.startswith(("http://", "https://")):
        return False

    # Skip assets whose URL suggests an icon, logo, badge or thumbnail.
    return not any(marker in lowered for marker in ("icon", "logo", "badge", "thumb"))