File size: 1,790 Bytes
28df1e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re
import logging
import random
from typing import Optional, Dict, Any, List
from urllib.parse import urljoin, quote

# Configure the root logger when this module is imported: records at INFO and
# above are emitted as "timestamp - logger name - level - message".
# Note: basicConfig() is a no-op if the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

def get_clean_text(element) -> str:
    """Return the whitespace-stripped text of an HTML/XML element.

    Args:
        element: Any object exposing a ``text`` attribute (for example a
            parsed HTML node), or ``None`` when a lookup found nothing.

    Returns:
        The element's text with leading/trailing whitespace removed, or an
        empty string when the element is ``None``, has no ``text`` attribute,
        or its text is ``None``/empty.
    """
    # Compare against None explicitly: some element types (e.g.
    # xml.etree.ElementTree elements without children) evaluate as falsy
    # even though they exist and carry text.
    if element is None:
        return ""

    # Tolerate objects without a ``text`` attribute and parsers that report
    # "no text" as None instead of "".
    text = getattr(element, "text", None)
    if not text:
        return ""
    return text.strip()

def clean_url(base_url: str, href: str) -> str:
    """Resolve ``href`` against ``base_url``.

    Returns the joined URL, or an empty string when ``href`` is empty or
    ``None``.
    """
    return urljoin(base_url, href) if href else ""

def extract_float_from_text(text: str, default: Optional[float] = None) -> Optional[float]:
    """Extract the first number found in ``text`` as a float.

    Both ``.`` and ``,`` are accepted as the decimal separator ("8.5" and
    "8,5" both give 8.5). When a number uses both separators with
    three-digit grouping, the grouping separator is dropped and the last
    separator is taken as the decimal point ("1,234.56" and "1.234,56" both
    give 1234.56). A number with a single separator is always read as a
    decimal, so "1,234" gives 1.234.

    Args:
        text: Text to search. May be empty or ``None``.
        default: Value returned when no number can be extracted.

    Returns:
        The first number in ``text`` as a float, or ``default`` when ``text``
        is empty or contains no digits.
    """
    if not text:
        return default

    # Alternatives are tried in order at the first digit encountered, so the
    # grouped forms win over the plain form only when they actually apply.
    match = re.search(
        r'(?P<comma_grouped>\d{1,3}(?:,\d{3})+\.\d+)'   # 1,234.56
        r'|(?P<dot_grouped>\d{1,3}(?:\.\d{3})+,\d+)'    # 1.234,56
        r'|(?P<plain>\d+[.,]?\d*)',                     # 8.5 / 8,5 / 42
        text,
    )
    if match is None:
        return default

    if match.group('comma_grouped'):
        number = match.group('comma_grouped').replace(',', '')
    elif match.group('dot_grouped'):
        number = match.group('dot_grouped').replace('.', '').replace(',', '.')
    else:
        number = match.group('plain').replace(',', '.')

    try:
        return float(number)
    except ValueError:
        # Defensive: the patterns above should only yield float-parsable text.
        return default

def construct_booking_search_url(destination: str, hotel_name: Optional[str] = None) -> str:
    """Build a Booking.com search URL for a destination.

    When ``hotel_name`` is given (and non-empty) it is placed before the
    destination in the search string. The search string is percent-encoded
    before being placed in the ``ss`` query parameter.
    """
    if hotel_name:
        query = f"{hotel_name} {destination}"
    else:
        query = destination
    return f"https://www.booking.com/search.html?ss={quote(query)}"

def is_valid_image_url(url: str) -> bool:
    """Check whether ``url`` looks like a usable room image rather than a logo.

    A URL is rejected when it is empty, is an inline ``data:`` URI, contains
    one of the keywords "icon", "logo", "badge" or "thumb", or does not start
    with ``http://`` or ``https://``. All comparisons are case-insensitive,
    so ``.../Logo.png`` is rejected and ``HTTPS://...`` is accepted.

    Args:
        url: Candidate image URL. May be empty or ``None``.

    Returns:
        ``True`` if the URL passes every check, ``False`` otherwise.
    """
    if not url:
        return False

    # URL schemes are case-insensitive and asset paths vary in casing, so
    # run every check against a lower-cased copy.
    lowered = url.lower()

    # Inline data URIs are not fetchable image links.
    if lowered.startswith("data:"):
        return False

    # Keyword heuristic: these path fragments usually denote icons, logos,
    # badges or thumbnails rather than full room photos.
    if any(keyword in lowered for keyword in ("icon", "logo", "badge", "thumb")):
        return False

    # Must be an absolute http(s) URL.
    return lowered.startswith(("http://", "https://"))