| """Utility functions for the Hotel Search App.""" |
|
|
| import re |
| from urllib.parse import urlparse |
|
|
# Domains that is_travel_agency() treats as travel agencies / aggregators.
# The list also carries general search engines (google.com, bing.com).
# Entries are lower-case and written without a "www." prefix; kept in
# alphabetical order so additions are easy to review.
BLOCKED_DOMAINS = {
    "agoda.com",
    "bing.com",
    "booking.com",
    "cheapoair.com",
    "cheaptickets.com",
    "cleartrip.com",
    "destinia.com",
    "edreams.com",
    "expedia.com",
    "getaroom.com",
    "goibibo.com",
    "google.com",
    "hostelworld.com",
    "hotels.com",
    "hotels.ng",
    "hotelscombined.com",
    "hotwire.com",
    "hrs.com",
    "kayak.com",
    "lastminute.com",
    "lonelyplanet.com",
    "loveholidays.com",
    "makemytrip.com",
    "momondo.com",
    "onetravel.com",
    "opodo.com",
    "orbitz.com",
    "priceline.com",
    "secretescapes.com",
    "skyscanner.com",
    "skyscanner.net",
    "smartertravel.com",
    "snaptravel.com",
    "travelocity.com",
    "travelpod.com",
    "travelsupermarket.com",
    "travelzoo.com",
    "trip.com",
    "tripadvisor.com",
    "trivago.com",
    "wotif.com",
    "yatra.com",
}
|
|
|
|
| def is_travel_agency(url: str) -> bool: |
| """Return True if the URL belongs to a known travel agency or aggregator.""" |
| try: |
| parsed = urlparse(url) |
| domain = parsed.netloc.lower().replace("www.", "") |
| for blocked in BLOCKED_DOMAINS: |
| if blocked in domain: |
| return True |
| return False |
| except Exception: |
| return False |
|
|
|
|
| def extract_direct_hotel_url(urls: list[str]) -> str | None: |
| """From a list of URLs, return the first one that is NOT a travel agency.""" |
| for url in urls: |
| if url and not is_travel_agency(url): |
| return url |
| return None |
|
|
|
|
| def extract_price_from_text(text: str) -> float | None: |
| """Try to extract a dollar price from a text string.""" |
| patterns = [ |
| r"\$\s?(\d{1,5}(?:\.\d{2})?)", |
| r"(\d{1,5})\s*(?:dollars|usd|per night|/night|a night)", |
| ] |
| for pattern in patterns: |
| match = re.search(pattern, text, re.IGNORECASE) |
| if match: |
| try: |
| return float(match.group(1)) |
| except ValueError: |
| continue |
| return None |
|
|
|
|
def clean_snippet(text: str) -> str:
    """Strip markup and normalise whitespace in a search result snippet.

    Anything of the form ``<...>`` is removed outright (no space is put in
    its place), then each run of whitespace is collapsed to a single space
    and the ends are trimmed.  Falsy input yields an empty string.
    """
    if not text:
        return ""
    without_tags = re.sub(r"<[^>]+>", "", text)
    return re.sub(r"\s+", " ", without_tags).strip()
|
|