Spaces:

garvitcpp
/

accomodation-info-api

Paused

App Files Files Community

accomodation-info-api / services /utils /html_utils.py

garvitcpp

Upload 27 files

28df1e8 verified 11 months ago

raw

history blame contribute delete

2.03 kB

	from bs4 import BeautifulSoup # type: ignore
	from typing import List, Optional
	from urllib.parse import urljoin
	import re
	import logging
	from .image_utils import is_logo_image

	logger = logging.getLogger(__name__)

	def extract_rating_from_element(element) -> Optional[float]:
	    """Find a numeric review score inside ``element``.

	    The score badge is consulted first (``.bui-review-score__badge``, falling
	    back to ``[data-testid='review-score']``).  If that yields no number, the
	    text of the wider review container is searched for a decimal value.

	    Args:
	        element: Object exposing ``select_one(css_selector)``; the nodes it
	            returns must provide ``.text`` and ``get_text()``.

	    Returns:
	        The score rounded to one decimal place, or ``None`` when no number is
	        found or any error occurs (errors are logged, never raised).
	    """

	    def parse_score(pattern, text):
	        # First number matching ``pattern``; a decimal comma counts as a dot.
	        hit = re.search(pattern, text)
	        if hit is None:
	            return None
	        return round(float(hit.group(1).replace(',', '.')), 1)

	    try:
	        badge = element.select_one(".bui-review-score__badge")
	        if not badge:
	            badge = element.select_one("[data-testid='review-score']")

	        if badge:
	            # The badge may hold a bare integer, so the fraction is optional.
	            score = parse_score(r"(\d+[.,]?\d*)", badge.text.strip())
	            if score is not None:
	                return score

	        # The container holds free text, so only a true decimal is accepted.
	        container = element.select_one(".bui-review-score, .d10a6220b4")
	        if container:
	            score = parse_score(r"(\d+[.,]\d+)", container.get_text())
	            if score is not None:
	                return score
	    except Exception as e:
	        logger.error(f"Error extracting rating: {e}")

	    return None

	def extract_images_from_soup(soup: BeautifulSoup, url: str, selectors: List[str], max_images: int = 5) -> List[str]:
	    """Collect up to ``max_images`` distinct image URLs from parsed HTML.

	    Selectors are tried in the order given.  For each matched element the
	    first non-empty attribute among ``src``, ``data-src`` and
	    ``data-lazy-src`` is used.  Sources rejected by ``is_logo_image`` are
	    skipped, and sources that do not start with ``http`` are resolved
	    against ``url``.

	    Args:
	        soup: Parsed document to search.
	        url: Base URL used to resolve non-absolute image sources.
	        selectors: CSS selectors, tried in order.
	        max_images: Maximum number of URLs to return; a value of zero or
	            less yields an empty list.

	    Returns:
	        Image URLs in discovery order, without duplicates.
	    """
	    images: List[str] = []

	    # The cap below is only checked after an append, so without this guard
	    # a non-positive limit would still let the first image through.
	    if max_images <= 0:
	        return images

	    for selector in selectors:
	        for img in soup.select(selector):
	            src = img.get("src") or img.get("data-src") or img.get("data-lazy-src")
	            if not src or is_logo_image(src):
	                continue

	            if not src.startswith("http"):
	                src = urljoin(url, src)

	            if src in images:
	                continue

	            images.append(src)
	            if len(images) >= max_images:
	                return images

	    return images