from bs4 import BeautifulSoup  # type: ignore
from typing import List, Optional
from urllib.parse import urljoin
import re
import logging

from .image_utils import is_logo_image

logger = logging.getLogger(__name__)

# Badge text: an integer optionally followed by a decimal part ("8", "8.5", "8,5").
_BADGE_RATING_RE = re.compile(r"(\d+[.,]?\d*)")
# Free-form container text: require a decimal part so that unrelated integers
# in the surrounding text are not picked up as the rating.
_DECIMAL_RATING_RE = re.compile(r"(\d+[.,]\d+)")


def _parse_rating(text: str, pattern: "re.Pattern") -> Optional[float]:
    """Return the first number matched by *pattern* in *text*, rounded to 1 decimal.

    A comma decimal separator is normalised to a dot before conversion.
    Returns None when the pattern does not match.
    """
    match = pattern.search(text)
    if match is None:
        return None
    return round(float(match.group(1).replace(",", ".")), 1)


def extract_rating_from_element(element) -> Optional[float]:
    """Extract a numeric rating from an HTML element.

    The dedicated score badge (``.bui-review-score__badge`` or
    ``[data-testid='review-score']``) is tried first; if it is absent or
    holds no number, the text of the wider review container
    (``.bui-review-score, .d10a6220b4``) is searched for a decimal number.

    Args:
        element: Object exposing ``select_one`` (e.g. a BeautifulSoup tag).

    Returns:
        The rating rounded to one decimal place, or None when no rating is
        found or any error occurs while extracting it (the error is logged,
        not raised).
    """
    try:
        rating_elem = (
            element.select_one(".bui-review-score__badge")
            or element.select_one("[data-testid='review-score']")
        )
        if rating_elem:
            rating = _parse_rating(rating_elem.text.strip(), _BADGE_RATING_RE)
            if rating is not None:
                return rating

        # Look for review text near ratings
        review_container = element.select_one(".bui-review-score, .d10a6220b4")
        if review_container:
            rating = _parse_rating(review_container.get_text(), _DECIMAL_RATING_RE)
            if rating is not None:
                return rating
    except Exception as e:
        # Deliberately best-effort: a malformed element must not abort the
        # caller, so the failure is logged and None is returned.
        logger.error("Error extracting rating: %s", e)

    return None


def extract_images_from_soup(
    soup: BeautifulSoup,
    url: str,
    selectors: List[str],
    max_images: int = 5,
) -> List[str]:
    """Extract image URLs from HTML using the provided CSS selectors.

    For each element matched by each selector (in order), the first non-empty
    of the ``src``, ``data-src`` and ``data-lazy-src`` attributes is used.
    Sources for which ``is_logo_image`` returns a truthy value are skipped.
    Anything that is not already an ``http://`` or ``https://`` URL is
    resolved against *url*. Duplicates are dropped, keeping first-seen order.

    Args:
        soup: Parsed document to search.
        url: Base URL used to resolve relative image sources.
        selectors: CSS selectors, tried in the given order.
        max_images: Maximum number of URLs to return; values <= 0 yield an
            empty list.

    Returns:
        Up to *max_images* unique image URLs.
    """
    # Without this guard the limit check below would only fire after the
    # first append, returning one image even when none were requested.
    if max_images <= 0:
        return []

    images: List[str] = []
    for selector in selectors:
        for img in soup.select(selector):
            src = img.get("src") or img.get("data-src") or img.get("data-lazy-src")
            if not src or is_logo_image(src):
                continue
            # Match the full scheme prefix: a bare "http" test would treat a
            # relative path such as "httpdocs/a.png" as already absolute.
            if not src.startswith(("http://", "https://")):
                src = urljoin(url, src)
            if src in images:
                continue
            images.append(src)
            if len(images) >= max_images:
                return images

    return images