File size: 2,031 Bytes
28df1e8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | from bs4 import BeautifulSoup # type: ignore
from typing import List, Optional
from urllib.parse import urljoin
import re
import logging
from .image_utils import is_logo_image
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# Badge text: first number found, decimal part optional ("8", "8.5", "8,5").
_BADGE_RATING_RE = re.compile(r"(\d+[.,]?\d*)")
# Review-container text: a decimal part is required for a match.
_CONTAINER_RATING_RE = re.compile(r"(\d+[.,]\d+)")


def _parse_rating(pattern, text) -> Optional[float]:
    """Return the first *pattern* match in *text* as a float rounded to one decimal, else None."""
    match = pattern.search(text)
    if not match:
        return None
    # A comma is accepted as the decimal separator.
    return round(float(match.group(1).replace(",", ".")), 1)


def extract_rating_from_element(element) -> Optional[float]:
    """Extract a numeric rating from an HTML element.

    Two lookups are tried in order:

    1. A score badge selected by ``.bui-review-score__badge`` or, failing
       that, ``[data-testid='review-score']``. The first number in its
       stripped text is used, with or without a decimal part.
    2. A review container selected by ``.bui-review-score, .d10a6220b4``.
       Only a number with an explicit decimal part is accepted here.

    Args:
        element: Object exposing ``select_one(css_selector)``; the selected
            nodes must expose ``.text`` and ``.get_text()``. ``None`` is
            accepted and yields ``None``.

    Returns:
        The rating rounded to one decimal place, or ``None`` when no rating
        is found or an error occurs while extracting it.
    """
    # Nothing to inspect: return quietly instead of raising into the handler
    # below and logging an error for a perfectly normal "no element" case.
    if element is None:
        return None

    try:
        badge = (
            element.select_one(".bui-review-score__badge")
            or element.select_one("[data-testid='review-score']")
        )
        if badge:
            rating = _parse_rating(_BADGE_RATING_RE, badge.text.strip())
            if rating is not None:
                return rating

        # Look for review text near ratings
        container = element.select_one(".bui-review-score, .d10a6220b4")
        if container:
            return _parse_rating(_CONTAINER_RATING_RE, container.get_text())
    except Exception as e:
        # Best-effort extraction: log and fall through to None.
        logger.error("Error extracting rating: %s", e)
    return None
def extract_images_from_soup(soup: BeautifulSoup, url: str, selectors: List[str], max_images: int = 5) -> List[str]:
    """Extract image URLs from HTML using the provided selectors.

    Selectors are processed in the given order. For every matched node the
    source is taken from the first non-empty attribute among ``src``,
    ``data-src`` and ``data-lazy-src``. Sources flagged by ``is_logo_image``
    are skipped, sources not starting with ``"http"`` are resolved against
    *url* with ``urljoin``, and duplicates are dropped while keeping
    first-seen order.

    Args:
        soup: Parsed document exposing ``select(css_selector)``.
        url: Base URL used to resolve non-absolute image sources.
        selectors: CSS selectors to try, in priority order.
        max_images: Maximum number of URLs to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of at most ``max_images`` unique image URLs.
    """
    images: List[str] = []

    # The limit check below runs only after an append, so without this guard
    # a limit of 0 (or less) would still let one image through.
    if max_images <= 0:
        return images

    for selector in selectors:
        for img in soup.select(selector):
            src = img.get("src") or img.get("data-src") or img.get("data-lazy-src")
            if not src or is_logo_image(src):
                continue
            if not src.startswith("http"):
                src = urljoin(url, src)
            if src in images:
                continue
            images.append(src)
            if len(images) >= max_images:
                return images
    return images