File size: 2,031 Bytes
28df1e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from bs4 import BeautifulSoup # type: ignore
from typing import List, Optional
from urllib.parse import urljoin
import re
import logging
from .image_utils import is_logo_image

logger = logging.getLogger(__name__)

def extract_rating_from_element(element) -> Optional[float]:
    """Return the numeric review score found inside ``element``, if any.

    The score badge (``.bui-review-score__badge``, falling back to
    ``[data-testid='review-score']``) is consulted first. If it yields no
    number, the surrounding review container is searched for a decimal
    value. Both ``.`` and ``,`` are accepted as the decimal separator and
    the result is rounded to one decimal place.

    Any exception raised while inspecting the element is logged and
    swallowed; ``None`` is returned when no rating can be determined.
    """
    def _as_score(match) -> float:
        # Normalise a comma decimal separator before converting.
        return round(float(match.group(1).replace(',', '.')), 1)

    try:
        badge = element.select_one(".bui-review-score__badge")
        if not badge:
            badge = element.select_one("[data-testid='review-score']")

        if badge:
            found = re.search(r"(\d+[.,]?\d*)", badge.text.strip())
            if found:
                return _as_score(found)

        # Fallback: scan the whole review container, where only a value
        # with an explicit decimal part is accepted.
        container = element.select_one(".bui-review-score, .d10a6220b4")
        if container:
            found = re.search(r"(\d+[.,]\d+)", container.get_text())
            if found:
                return _as_score(found)
    except Exception as e:
        logger.error(f"Error extracting rating: {e}")

    return None

def extract_images_from_soup(soup: BeautifulSoup, url: str, selectors: List[str], max_images: int = 5) -> List[str]:
    """Collect up to ``max_images`` unique image URLs matched by ``selectors``.

    Args:
        soup: Parsed document to search.
        url: Base URL used to resolve sources that do not start with
            ``"http"``.
        selectors: CSS selectors, tried in the given order.
        max_images: Maximum number of URLs to return. A value of zero or
            less yields an empty list.

    Returns:
        Image URLs in discovery order, without duplicates. Sources that
        ``is_logo_image`` flags are skipped.
    """
    images: List[str] = []
    # The limit is otherwise only checked after an append, so a
    # non-positive limit would still let one image through.
    if max_images <= 0:
        return images

    seen = set()
    for selector in selectors:
        for img in soup.select(selector):
            # Take the first populated attribute; the data-* variants are
            # only consulted when ``src`` is missing or empty.
            src = img.get("src") or img.get("data-src") or img.get("data-lazy-src")
            if not src or is_logo_image(src):
                continue

            if not src.startswith("http"):
                src = urljoin(url, src)

            if src in seen:
                continue
            seen.add(src)
            images.append(src)
            if len(images) >= max_images:
                return images

    return images