| from bs4 import BeautifulSoup
|
| from typing import List, Optional
|
| from urllib.parse import urljoin
|
| import re
|
| import logging
|
| from .image_utils import is_logo_image
|
|
|
| logger = logging.getLogger(__name__)
|
|
|
# Selectors for a dedicated score badge, tried in order of preference.
_BADGE_SELECTORS = (".bui-review-score__badge", "[data-testid='review-score']")
# Selector for the wider review container used as a last resort.
_CONTAINER_SELECTOR = ".bui-review-score, .d10a6220b4"

# Badge text: accepts an integer or a decimal with "." or "," ("8", "8.5", "8,5").
_BADGE_RATING_RE = re.compile(r"(\d+[.,]?\d*)")
# Container text: a decimal separator is required, so bare integers are ignored
# (presumably to skip unrelated numbers in the surrounding text -- confirm).
_CONTAINER_RATING_RE = re.compile(r"(\d+[.,]\d+)")


def _first_rating(text, pattern) -> Optional[float]:
    """Return the first number in *text* matching *pattern*, rounded to 1 decimal."""
    found = pattern.search(text)
    if not found:
        return None
    # Normalise a decimal comma to a dot so float() can parse the value.
    return round(float(found.group(1).replace(",", ".")), 1)


def extract_rating_from_element(element) -> Optional[float]:
    """Extract a numeric review score from an HTML element.

    Each badge selector is tried in turn and the first one whose text
    contains a number wins. If none yields a number, the review container
    is searched for a decimal number instead.

    Args:
        element: Object exposing ``select_one(css_selector)``; matched nodes
            must expose ``.text`` (badges) or ``.get_text()`` (container).
            ``None`` is accepted and yields ``None``.

    Returns:
        The score rounded to one decimal place, or ``None`` when no score
        is found or an error occurs while reading the element.
    """
    if element is None:
        return None

    try:
        for selector in _BADGE_SELECTORS:
            badge = element.select_one(selector)
            if not badge:
                continue
            rating = _first_rating(badge.text.strip(), _BADGE_RATING_RE)
            if rating is not None:
                return rating
            # A badge without a number must not block the remaining lookups.

        container = element.select_one(_CONTAINER_SELECTOR)
        if container:
            return _first_rating(container.get_text(), _CONTAINER_RATING_RE)
    except Exception as e:
        # Best-effort extraction: a malformed element must not break the caller.
        logger.error("Error extracting rating: %s", e)

    return None
|
|
|
def extract_images_from_soup(soup: "BeautifulSoup", url: str, selectors: List[str], max_images: int = 5) -> List[str]:
    """Collect unique image URLs from parsed HTML using CSS selectors.

    Selectors are processed in the given order. For every matched node the
    first non-empty attribute among ``src``, ``data-src`` and
    ``data-lazy-src`` is used. Sources rejected by ``is_logo_image`` are
    skipped, and sources that are not ``http://`` or ``https://`` URLs are
    resolved against ``url``.

    Args:
        soup: Parsed document exposing ``select(css_selector)``.
        url: Base URL used to resolve relative image sources.
        selectors: CSS selectors to query, in priority order.
        max_images: Maximum number of URLs to return; values of zero or
            less yield an empty list.

    Returns:
        Up to ``max_images`` distinct image URLs in discovery order.
    """
    images: List[str] = []
    # The limit is otherwise only checked after an append, so a
    # non-positive limit has to be rejected up front.
    if max_images <= 0:
        return images

    for selector in selectors:
        for img in soup.select(selector):
            src = img.get("src") or img.get("data-src") or img.get("data-lazy-src")
            if not src or is_logo_image(src):
                continue

            # Check for a real scheme prefix: a bare "http" test would treat
            # relative paths such as "http-assets/a.png" as absolute.
            if not src.startswith(("http://", "https://")):
                src = urljoin(url, src)

            if src in images:
                continue

            images.append(src)
            if len(images) >= max_images:
                return images

    return images