Spaces:

cespin24
/

Hotel

Sleeping

App Files Files Community

Hotel / utils.py

cespin24

Upload 10 files

a98dd63 verified 29 days ago

raw

history blame contribute delete

2.2 kB

	"""Utility functions for the Hotel Search App."""

	import re
	from urllib.parse import urlparse

	# Domains treated as travel agencies / aggregators. is_travel_agency() matches
	# them against whole host labels, so list bare domains without a "www." prefix.
	BLOCKED_DOMAINS = {
	    "expedia.com", "booking.com", "hotels.com", "trivago.com",
	    "kayak.com", "priceline.com", "orbitz.com", "travelocity.com",
	    "agoda.com", "trip.com", "hotwire.com", "cheaptickets.com",
	    "tripadvisor.com", "google.com", "bing.com", "momondo.com",
	    "skyscanner.com", "makemytrip.com", "goibibo.com", "yatra.com",
	    "cleartrip.com", "lonelyplanet.com", "hostelworld.com",
	    "hotels.ng", "hrs.com", "destinia.com",
	    "travelzoo.com", "smartertravel.com", "travelpod.com",
	    "wotif.com", "lastminute.com", "opodo.com", "edreams.com",
	    "loveholidays.com", "secretescapes.com", "hotelscombined.com",
	    "travelsupermarket.com", "skyscanner.net", "cheapoair.com",
	    "onetravel.com", "getaroom.com", "snaptravel.com",
	}


	def is_travel_agency(url: str) -> bool:
	    """Return True if the URL belongs to a known travel agency or aggregator.

	    A blocked domain matches only when it appears as a run of complete
	    dot-separated labels in the host. ``www.expedia.com``, ``uk.hotels.com``
	    and ``expedia.com.mx`` are therefore blocked, while look-alikes such as
	    ``marriotthotels.com`` or ``climbing.com`` (which merely contain a blocked
	    domain as a substring) are not.

	    Args:
	        url: Absolute URL, or a bare ``host/path`` string without a scheme.

	    Returns:
	        True when the host matches an entry of ``BLOCKED_DOMAINS``; False
	        otherwise, including for empty, non-string or unparsable input.
	    """
	    if not url or not isinstance(url, str):
	        return False

	    # urlparse only recognises a host when the authority is introduced by
	    # "//"; without it a bare "expedia.com/x" would be parsed as a path.
	    candidate = url.strip()
	    if "://" not in candidate and not candidate.startswith("//"):
	        candidate = "//" + candidate

	    try:
	        # .hostname is lower-cased and excludes any port or userinfo.
	        host = urlparse(candidate).hostname
	    except ValueError:
	        # Raised for malformed authorities such as an unclosed IPv6 bracket.
	        return False
	    if not host:
	        return False

	    # Wrap both sides in dots so that only whole labels can match.
	    dotted_host = f".{host.rstrip('.')}."
	    return any(f".{blocked}." in dotted_host for blocked in BLOCKED_DOMAINS)


	def extract_direct_hotel_url(urls: list[str]) -> str | None:
	    """Return the first URL in ``urls`` that is not a travel-agency URL.

	    Args:
	        urls: Candidate URLs in priority order; falsy entries are skipped.

	    Returns:
	        The first non-empty URL for which ``is_travel_agency`` is False, or
	        None when every entry is rejected or ``urls`` is empty or None.
	    """
	    # "urls or ()" lets an empty or missing list fall through to None.
	    return next(
	        (url for url in urls or () if url and not is_travel_agency(url)),
	        None,
	    )


	def extract_price_from_text(text: str) -> float \| None:
	"""Try to extract a dollar price from a text string."""
	patterns = [
	r"\$\s?(\d{1,5}(?:\.\d{2})?)",
	r"(\d{1,5})\s*(?:dollars\|usd\|per night\|/night\|a night)",
	]
	for pattern in patterns:
	match = re.search(pattern, text, re.IGNORECASE)
	if match:
	try:
	return float(match.group(1))
	except ValueError:
	continue
	return None


	def clean_snippet(text: str) -> str:
	    """Clean up a search result snippet.

	    Removes every ``<...>`` span, collapses each run of whitespace to a single
	    space, and trims the ends. HTML entities such as ``&amp;`` are left as-is.

	    Args:
	        text: Raw snippet text; may be empty or None.

	    Returns:
	        The cleaned snippet, or "" when ``text`` is falsy.
	    """
	    if not text:
	        return ""
	    without_tags = re.sub(r"<[^>]+>", "", text)
	    return re.sub(r"\s+", " ", without_tags).strip()