from __future__ import annotations

from typing import List, Tuple
from urllib.parse import urljoin

import structlog
from bs4 import BeautifulSoup

from crawler.storage import (
    PAGE_TYPE_DETAIL,
    PARSE_PARSED,
    PageRecord,
    Storage,
)
from crawler.utils import canonicalize_url, now_iso

logger = structlog.get_logger(__name__)

# Single-letter test-type badges accepted from catalog rows.
ALLOWED_TEST_TYPES = {"A", "B", "C", "D", "E", "K", "P", "S"}

# Substrings that mark the green "supported" indicator in inline styles,
# class names, or SVG fill attributes.
GREEN_TOKENS = ["green", "#8ac640", "rgb(138", "rgb(103", "0, 167, 83", "8ac640"]


def _has_green_indicator(cell) -> bool:
    """Return True if any descendant of ``cell`` looks like a green yes-marker."""
    for el in cell.find_all(True):
        style = (el.get("style") or "").lower()
        raw_class = el.get("class")
        if isinstance(raw_class, list):
            classes = " ".join(raw_class).lower()
        else:
            classes = str(raw_class or "").lower()
        combined = f"{style} {classes}"
        if any(tok in combined for tok in GREEN_TOKENS):
            return True
        if "-yes" in classes or "catalogue__circle" in classes:
            return True
        fill = (el.get("fill") or "").lower()
        if any(tok in fill for tok in GREEN_TOKENS):
            return True
        # Generic icon/dot detection (when color is applied via CSS)
        if el.name in {"svg", "circle", "path", "i"}:
            return True
        if "dot" in classes or "indicator" in classes:
            return True
    return False


def extract_catalog_entries(html: str) -> List[dict]:
    """Parse a catalog page for individual test solutions.

    This is intentionally defensive; selectors may change on shl.com. We look
    for anchors within tables whose headers mention "Individual Test
    Solutions" or "Assessment".
    """
    soup = BeautifulSoup(html, "lxml")
    entries = []

    for table in soup.find_all("table"):
        headers = " ".join(th.get_text(" ", strip=True) for th in table.find_all("th"))
        if "Individual Test Solutions" not in headers and "Assessment" not in headers:
            continue
        for row in table.find_all("tr"):
            link = row.find("a", href=True)
            if not link:
                continue
            name = link.get_text(strip=True)
            detail_url = link["href"]

            # Collect single-letter test-type badges, preserving order and
            # dropping duplicates.
            badges_text = [span.get_text("", strip=True) for span in row.find_all("span")]
            test_letters = []
            for token in badges_text:
                token = token.strip()
                if len(token) == 1 and token in ALLOWED_TEST_TYPES:
                    test_letters.append(token)
            test_type = ",".join(dict.fromkeys(test_letters)) or None

            # Column-layout heuristic: name | remote | adaptive. If the row
            # has fewer cells, fall back to scanning the badge text.
            tds = row.find_all("td")
            if len(tds) >= 3:
                remote = _has_green_indicator(tds[1])
                adaptive = _has_green_indicator(tds[2])
            else:
                flat_badges = " ".join(badges_text).lower()
                remote = "remote" in flat_badges
                adaptive = "adaptive" in flat_badges or "irt" in flat_badges

            entries.append(
                {
                    "name": name,
                    "url": detail_url,
                    "test_type": test_type,
                    # False collapses to None: a missing green marker is
                    # recorded as unknown rather than an explicit no.
                    "remote_support": remote or None,
                    "adaptive_support": adaptive or None,
                }
            )
    return entries


def find_next_pages(html: str, source_url: str) -> List[str]:
    """Find pagination links ("Next" or numbered) and resolve them to absolute URLs."""
    soup = BeautifulSoup(html, "lxml")
    urls = []
    for link in soup.find_all("a", href=True):
        text = link.get_text(" ", strip=True).lower()
        if "next" in text or text.isdigit():
            urls.append(canonicalize_url(urljoin(source_url, link["href"])))
    # De-duplicate while preserving order.
    seen = set()
    deduped = []
    for u in urls:
        if u not in seen:
            seen.add(u)
            deduped.append(u)
    return deduped


def parse_catalog_page(html: str, source_url: str, storage: Storage) -> Tuple[int, List[str], List[str]]:
    """Extract entries, persist them, and return (count, detail URLs, next pages)."""
    entries = extract_catalog_entries(html)
    discovered_urls: List[str] = []
    for entry in entries:
        detail_url = canonicalize_url(urljoin(source_url, entry["url"]))
        discovered_urls.append(detail_url)
        storage.upsert_page(
            PageRecord(
                url=detail_url,
                page_type=PAGE_TYPE_DETAIL,
            )
        )
        storage.upsert_assessment(
            {
                "url": detail_url,
                "name": entry.get("name"),
                "test_type": entry.get("test_type"),
                "remote_support": entry.get("remote_support"),
                "adaptive_support": entry.get("adaptive_support"),
                "source_catalog_page": canonicalize_url(source_url),
                "discovered_at": now_iso(),
            }
        )
    storage.update_parse_status(source_url, PARSE_PARSED)

    next_pages = find_next_pages(html, source_url)
    logger.info(
        "catalog.parse.summary",
        source_url=source_url,
        discovered=len(discovered_urls),
        next_pages=len(next_pages),
    )
    return len(entries), discovered_urls, next_pages