from __future__ import annotations

from typing import List, Tuple
from urllib.parse import urljoin

import structlog
from bs4 import BeautifulSoup

from crawler.storage import (
    PAGE_TYPE_DETAIL,
    PARSE_PARSED,
    PageRecord,
    Storage,
)
from crawler.utils import canonicalize_url, now_iso

logger = structlog.get_logger(__name__)

# Single-letter badges that encode an entry's test type.
ALLOWED_TEST_TYPES = {"A", "B", "C", "D", "E", "K", "P", "S"}

# Substrings that suggest the green "supported" indicator; both hex and rgb()
# colour notations are listed since either may appear inline.
GREEN_TOKENS = ["green", "#8ac640", "rgb(138", "rgb(103", "0, 167, 83", "8ac640"]


def _has_green_indicator(cell) -> bool:
    """Return True if a table cell appears to contain the green 'yes' marker."""
    for el in cell.find_all(True):
        style = (el.get("style") or "").lower()
        raw_class = el.get("class")
        if isinstance(raw_class, list):
            classes = " ".join(raw_class).lower()
        else:
            classes = str(raw_class or "").lower()
        combined = f"{style} {classes}"
        if any(tok in combined for tok in GREEN_TOKENS):
            return True
        if "-yes" in classes or "catalogue__circle" in classes:
            return True
        fill = (el.get("fill") or "").lower()
        if any(tok in fill for tok in GREEN_TOKENS):
            return True
        # Generic icon/dot detection (when the colour is applied via CSS
        # classes rather than inline attributes).
        if el.name in {"svg", "circle", "path", "i"}:
            return True
        if "dot" in classes or "indicator" in classes:
            return True
    return False
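

# Illustrative cells the helper above is meant to flag (assumed markup, not
# captured from shl.com; the live catalog may differ):
#
#   <td><span class="catalogue__circle -yes"></span></td>        class token
#   <td><svg fill="#8ac640"><circle></circle></svg></td>         fill colour
#   <td><i class="dot" style="background: green"></i></td>       style colour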


def extract_catalog_entries(html: str) -> List[dict]:
    """Parse a catalog page for individual test solutions.

    This is intentionally defensive; selectors may change on shl.com. We look
    for anchors inside tables whose headers mention "Individual Test
    Solutions" or "Assessment"; see the illustrative row sketch after this
    function and the __main__ smoke test at the bottom of the module.
    """
    soup = BeautifulSoup(html, "lxml")
    entries = []
    for table in soup.find_all("table"):
        headers = " ".join(th.get_text(" ", strip=True) for th in table.find_all("th"))
        if "Individual Test Solutions" not in headers and "Assessment" not in headers:
            continue
        for row in table.find_all("tr"):
            link = row.find("a", href=True)
            if not link:
                continue
            name = link.get_text(strip=True)
            detail_url = link["href"]
            badges_text = [span.get_text("", strip=True) for span in row.find_all("span")]
            # Single-letter badge spans (e.g. "K", "P") encode the test type;
            # dict.fromkeys de-duplicates them while preserving order.
            test_letters = []
            for token in badges_text:
                token = token.strip()
                if len(token) == 1 and token in ALLOWED_TEST_TYPES:
                    test_letters.append(token)
            test_type = ",".join(dict.fromkeys(test_letters)) or None
            tds = row.find_all("td")
            if len(tds) >= 3:
                # The second and third columns carry the Remote/Adaptive dots.
                remote = _has_green_indicator(tds[1])
                adaptive = _has_green_indicator(tds[2])
            else:
                # Fallback when the expected three-column layout is missing:
                # infer support from the badge text itself.
                flat_badges = " ".join(badges_text).lower()
                remote = "remote" in flat_badges
                adaptive = "adaptive" in flat_badges or "irt" in flat_badges
            entries.append(
                {
                    "name": name,
                    "url": detail_url,
                    "test_type": test_type,
                    # False collapses to None: only positive detections are
                    # recorded; absence stays "unknown".
                    "remote_support": remote or None,
                    "adaptive_support": adaptive or None,
                }
            )
    return entries
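

# Assumed row shape (illustrative only; not verified against the live site):
#
#   <tr>
#     <td><a href="/view/foo/">Foo Test</a><span>K</span><span>P</span></td>
#     <td><span class="catalogue__circle -yes"></span></td>    <- Remote
#     <td></td>                                                <- Adaptive
#   </tr>
#
# would yield {"name": "Foo Test", "url": "/view/foo/", "test_type": "K,P",
# "remote_support": True, "adaptive_support": None}.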


def find_next_pages(html: str, source_url: str) -> List[str]:
    """Find pagination links ("Next" or numbered) and resolve them to absolute URLs."""
    soup = BeautifulSoup(html, "lxml")
    urls = []
    for link in soup.find_all("a", href=True):
        text = link.get_text(" ", strip=True).lower()
        if "next" in text or text.isdigit():
            urls.append(canonicalize_url(urljoin(source_url, link["href"])))
    # De-duplicate while preserving order.
    seen = set()
    deduped = []
    for u in urls:
        if u not in seen:
            seen.add(u)
            deduped.append(u)
    return deduped


def parse_catalog_page(html: str, source_url: str, storage: Storage) -> Tuple[int, List[str], List[str]]:
    """Persist a catalog page's entries; return (entry count, detail URLs, next-page URLs)."""
    entries = extract_catalog_entries(html)
    discovered_urls: List[str] = []
    for entry in entries:
        detail_url = canonicalize_url(urljoin(source_url, entry["url"]))
        discovered_urls.append(detail_url)
        # Record the detail page itself...
        storage.upsert_page(
            PageRecord(
                url=detail_url,
                page_type=PAGE_TYPE_DETAIL,
            )
        )
        # ...and everything already known about the assessment it describes.
        storage.upsert_assessment(
            {
                "url": detail_url,
                "name": entry.get("name"),
                "test_type": entry.get("test_type"),
                "remote_support": entry.get("remote_support"),
                "adaptive_support": entry.get("adaptive_support"),
                "source_catalog_page": canonicalize_url(source_url),
                "discovered_at": now_iso(),
            }
        )
    storage.update_parse_status(source_url, PARSE_PARSED)
    next_pages = find_next_pages(html, source_url)
    logger.info(
        "catalog.parse.summary",
        source_url=source_url,
        discovered=len(discovered_urls),
        next_pages=len(next_pages),
    )
    return len(entries), discovered_urls, next_pages
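

if __name__ == "__main__":
    # Minimal smoke test, not part of the crawler pipeline: run the pure
    # parsing helpers against a hand-written fragment. The HTML below is an
    # assumed shape of the shl.com catalog, not a captured page, so treat the
    # output as a sketch of expected behaviour rather than a guarantee.
    sample = """
    <table>
      <tr><th>Individual Test Solutions</th><th>Remote</th><th>Adaptive</th></tr>
      <tr>
        <td><a href="/view/example/">Example Test</a><span>K</span><span>P</span></td>
        <td><span class="catalogue__circle -yes"></span></td>
        <td></td>
      </tr>
    </table>
    <a href="?start=12">Next</a>
    """
    # Hypothetical catalog URL, used only to resolve the relative links above.
    base = "https://www.shl.com/solutions/products/product-catalog/"
    print(extract_catalog_entries(sample))
    print(find_next_pages(sample, base))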