"""Utility functions for the Hotel Search App.""" import re from urllib.parse import urlparse BLOCKED_DOMAINS = { "expedia.com", "booking.com", "hotels.com", "trivago.com", "kayak.com", "priceline.com", "orbitz.com", "travelocity.com", "agoda.com", "trip.com", "hotwire.com", "cheaptickets.com", "tripadvisor.com", "google.com", "bing.com", "momondo.com", "skyscanner.com", "makemytrip.com", "goibibo.com", "yatra.com", "cleartrip.com", "lonelyplanet.com", "hostelworld.com", "hotels.ng", "hrs.com", "destinia.com", "travelzoo.com", "smartertravel.com", "travelpod.com", "wotif.com", "lastminute.com", "opodo.com", "edreams.com", "loveholidays.com", "secretescapes.com", "hotelscombined.com", "travelsupermarket.com", "skyscanner.net", "cheapoair.com", "onetravel.com", "getaroom.com", "snaptravel.com", } def is_travel_agency(url: str) -> bool: """Return True if the URL belongs to a known travel agency or aggregator.""" try: parsed = urlparse(url) domain = parsed.netloc.lower().replace("www.", "") for blocked in BLOCKED_DOMAINS: if blocked in domain: return True return False except Exception: return False def extract_direct_hotel_url(urls: list[str]) -> str | None: """From a list of URLs, return the first one that is NOT a travel agency.""" for url in urls: if url and not is_travel_agency(url): return url return None def extract_price_from_text(text: str) -> float | None: """Try to extract a dollar price from a text string.""" patterns = [ r"\$\s?(\d{1,5}(?:\.\d{2})?)", r"(\d{1,5})\s*(?:dollars|usd|per night|/night|a night)", ] for pattern in patterns: match = re.search(pattern, text, re.IGNORECASE) if match: try: return float(match.group(1)) except ValueError: continue return None def clean_snippet(text: str) -> str: """Clean up a search result snippet.""" if not text: return "" text = re.sub(r"<[^>]+>", "", text) text = re.sub(r"\s+", " ", text).strip() return text