# garvitcpp - "Upload 27 files" (commit 28df1e8, verified)
import re
import logging
import random
from typing import Optional, Dict, Any, List
from urllib.parse import urljoin, quote
# Configure logging at import time: INFO level and a
# "timestamp - logger name - level - message" line format.
# NOTE(review): basicConfig runs as a side effect of importing this module;
# confirm that is intended rather than leaving setup to the application entry point.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def get_clean_text(element) -> str:
    """Return the text of an element with surrounding whitespace removed.

    Args:
        element: Any object exposing a string ``text`` attribute, or a
            falsy value (such as ``None``) when nothing was found.

    Returns:
        ``element.text`` stripped of leading/trailing whitespace, or an
        empty string when ``element`` is falsy.
    """
    if not element:
        return ""
    raw_text = element.text
    return raw_text.strip()
def clean_url(base_url: str, href: str) -> str:
    """Resolve ``href`` against ``base_url``.

    Args:
        base_url: The URL that relative links are resolved against.
        href: A relative or absolute link; may be empty or ``None``.

    Returns:
        The result of ``urljoin(base_url, href)``, or an empty string when
        ``href`` is empty/falsy.
    """
    return urljoin(base_url, href) if href else ""
def extract_float_from_text(text: str, default: Optional[float] = None) -> Optional[float]:
    """Extract the first number found in ``text`` as a float.

    Both ``.`` and ``,`` are understood as separators:

    * A single separator is treated as a decimal mark
      (``"8,5"`` -> ``8.5``, ``"4.5"`` -> ``4.5``). This also applies to the
      ambiguous ``"1,234"``, which stays ``1.234``.
    * Several separators forming well-formed thousands groups are treated as
      grouping, with an optional different final separator as the decimal
      mark (``"1,234.56"`` and ``"1.234,56"`` -> ``1234.56``,
      ``"1,234,567"`` -> ``1234567.0``).
    * Anything else (e.g. ``"1.2.3"``) falls back to the first
      ``digits[sep digits]`` prefix (``1.2``).

    Args:
        text: Text to scan; may be empty or ``None``.
        default: Value returned when no number is found.

    Returns:
        The parsed float, or ``default`` when ``text`` is empty or contains
        no digits.
    """
    if not text:
        return default

    match = re.search(r'\d+(?:[.,]\d+)*', text)
    if match is None:
        return default

    # The capturing group makes split() return alternating
    # [digits, separator, digits, separator, ...].
    pieces = re.split(r'([.,])', match.group(0))
    digit_runs = pieces[0::2]
    separators = pieces[1::2]

    if len(separators) >= 2:
        group_sep = separators[0]
        # A final separator that differs from the grouping one is the
        # decimal mark; otherwise the number has no fractional part.
        has_fraction = separators[-1] != group_sep
        group_count = len(separators) - 1 if has_fraction else len(separators)
        well_formed = (
            all(sep == group_sep for sep in separators[:group_count])
            and len(digit_runs[0]) <= 3
            and all(len(run) == 3 for run in digit_runs[1:group_count + 1])
        )
        if well_formed:
            number = ''.join(digit_runs[:group_count + 1])
            if has_fraction:
                number = f"{number}.{digit_runs[-1]}"
            return float(number)

    # Single separator, or a shape that is not valid thousands grouping:
    # read the first digit run plus at most one fractional part.
    number = digit_runs[0]
    if separators:
        number = f"{number}.{digit_runs[1]}"
    return float(number)
def construct_booking_search_url(destination: str, hotel_name: Optional[str] = None) -> str:
    """Build a Booking.com search URL for a destination and optional hotel.

    Args:
        destination: Place to search for.
        hotel_name: Optional hotel name; when given (and non-empty) it is
            placed before the destination in the search text.

    Returns:
        The search URL with the percent-encoded search text in the ``ss``
        query parameter.
    """
    if hotel_name:
        search_terms = f"{hotel_name} {destination}"
    else:
        search_terms = destination
    encoded_terms = quote(search_terms)
    return f"https://www.booking.com/search.html?ss={encoded_terms}"
def is_valid_image_url(url: str) -> bool:
    """Decide whether ``url`` looks like a usable image link.

    A URL is rejected when it is empty, is an inline ``data:`` URI, contains
    one of the substrings ``icon``, ``logo``, ``badge`` or ``thumb``
    (case-sensitive), or does not start with ``http://`` / ``https://``.

    Args:
        url: Candidate image URL; may be empty or ``None``.

    Returns:
        ``True`` when none of the rejection rules apply, ``False`` otherwise.
    """
    if not url or url.startswith("data:"):
        return False

    # Skip tiny images (likely icons)
    small_image_markers = ("icon", "logo", "badge", "thumb")
    for marker in small_image_markers:
        if marker in url:
            return False

    # Must be a full URL
    return url.startswith(("http://", "https://"))