from __future__ import annotations

from typing import List, Tuple
from urllib.parse import urljoin

import structlog
from bs4 import BeautifulSoup

from crawler.storage import (
    PAGE_TYPE_DETAIL,
    PARSE_PARSED,
    PageRecord,
    Storage,
)
from crawler.utils import canonicalize_url, now_iso

logger = structlog.get_logger(__name__)

# Single-letter test-type badges recognised in catalog rows.
ALLOWED_TEST_TYPES = {"A", "B", "C", "D", "E", "K", "P", "S"}
# Style/class/fill fragments that mark the green "yes" indicator in catalog tables.
GREEN_TOKENS = ["green", "#8ac640", "rgb(138", "rgb(103", "0, 167, 83", "8ac640"]


def _has_green_indicator(cell) -> bool:
    """Return True if a table cell contains a green "yes" marker."""
    for el in cell.find_all(True):
        style = (el.get("style") or "").lower()
        raw_classes = el.get("class")
        if isinstance(raw_classes, list):
            classes = " ".join(raw_classes).lower()
        else:
            classes = str(raw_classes or "").lower()
        combined = f"{style} {classes}"
        if any(tok in combined for tok in GREEN_TOKENS):
            return True
        if "-yes" in classes or "catalogue__circle" in classes:
            return True
        fill = (el.get("fill") or "").lower()
        if any(tok in fill for tok in GREEN_TOKENS):
            return True

        # Fallback heuristic: these columns typically render an icon only when
        # the answer is "yes", so any icon-like element counts as a positive.
        if el.name in {"svg", "circle", "path", "i"}:
            return True
        if "dot" in classes or "indicator" in classes:
            return True
    return False
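
# Illustrative sketch (the markup is a hypothetical stand-in, not guaranteed
# SHL layout): a cell whose span carries the "-yes" class is detected via the
# class-token branch above.
#
#     cell = BeautifulSoup(
#         '<table><tr><td><span class="catalogue__circle -yes"></span></td></tr></table>',
#         "lxml",
#     ).td
#     _has_green_indicator(cell)  # True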


def extract_catalog_entries(html: str) -> List[dict]:
    """Parse a catalog page for individual test solutions.

    This is intentionally defensive, since selectors on shl.com may change. We
    look for tables whose headers mention "Individual Test Solutions" (or
    "Assessment") and extract one entry per row that links to a detail page.
    """
    soup = BeautifulSoup(html, "lxml")
    entries = []

    for table in soup.find_all("table"):
        headers = " ".join(th.get_text(" ", strip=True) for th in table.find_all("th"))
        if "Individual Test Solutions" not in headers and "Assessment" not in headers:
            continue
        for row in table.find_all("tr"):
            link = row.find("a", href=True)
            if not link:
                continue
            name = link.get_text(strip=True)
            detail_url = link["href"]
            badges_text = [span.get_text("", strip=True) for span in row.find_all("span")]
            # Collect single-letter test-type badges, preserving order and
            # dropping duplicates.
            test_letters = []
            for token in badges_text:
                token = token.strip()
                if len(token) == 1 and token in ALLOWED_TEST_TYPES:
                    test_letters.append(token)
            test_type = ",".join(dict.fromkeys(test_letters)) or None
            tds = row.find_all("td")
            if len(tds) >= 3:
                # Assumed column layout: name, remote-testing marker, adaptive marker.
                remote = _has_green_indicator(tds[1])
                adaptive = _has_green_indicator(tds[2])
            else:
                # No marker columns; fall back to keywords in the badge text.
                flat_badges = " ".join(badges_text).lower()
                remote = "remote" in flat_badges
                adaptive = "adaptive" in flat_badges or "irt" in flat_badges
            entries.append(
                {
                    "name": name,
                    "url": detail_url,
                    "test_type": test_type,
                    # A missing marker is stored as None (unknown) rather than
                    # an explicit False, since it may just be selector drift.
                    "remote_support": remote if remote else None,
                    "adaptive_support": adaptive if adaptive else None,
                }
            )
    return entries
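
# Illustrative sketch of the row shape this function expects (the HTML is a
# minimal hypothetical stand-in, not real SHL markup):
#
#     html = (
#         '<table><tr><th>Individual Test Solutions</th></tr>'
#         '<tr><td><a href="/product/x">Example Test</a></td>'
#         '<td><span class="-yes"></span></td><td></td></tr></table>'
#     )
#     extract_catalog_entries(html)
#     # -> [{"name": "Example Test", "url": "/product/x", "test_type": None,
#     #      "remote_support": True, "adaptive_support": None}]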


def find_next_pages(html: str, source_url: str) -> List[str]:
    """Find pagination links ("Next" or numbered) and resolve to absolute URLs."""
    soup = BeautifulSoup(html, "lxml")
    urls = []
    for link in soup.find_all("a", href=True):
        text = link.get_text(" ", strip=True).lower()
        if "next" in text or text.isdigit():
            urls.append(canonicalize_url(urljoin(source_url, link["href"])))

    # Deduplicate while preserving discovery order.
    return list(dict.fromkeys(urls))
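
# Illustrative sketch (hypothetical URL; the exact output depends on what
# canonicalize_url normalizes): a relative "Next" href is resolved against
# source_url before canonicalization.
#
#     find_next_pages('<a href="?page=2">Next</a>', "https://example.com/catalog")
#     # -> ["https://example.com/catalog?page=2"] (after canonicalization)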


def parse_catalog_page(html: str, source_url: str, storage: Storage) -> Tuple[int, List[str], List[str]]:
    """Persist a catalog page's entries and return (count, detail URLs, next pages)."""
    entries = extract_catalog_entries(html)
    discovered_urls: List[str] = []

    for entry in entries:
        detail_url = canonicalize_url(urljoin(source_url, entry["url"]))
        discovered_urls.append(detail_url)
        storage.upsert_page(
            PageRecord(
                url=detail_url,
                page_type=PAGE_TYPE_DETAIL,
            )
        )
        storage.upsert_assessment(
            {
                "url": detail_url,
                "name": entry.get("name"),
                "test_type": entry.get("test_type"),
                "remote_support": entry.get("remote_support"),
                "adaptive_support": entry.get("adaptive_support"),
                "source_catalog_page": canonicalize_url(source_url),
                "discovered_at": now_iso(),
            }
        )

    storage.update_parse_status(source_url, PARSE_PARSED)
    next_pages = find_next_pages(html, source_url)
    logger.info(
        "catalog.parse.summary",
        source_url=source_url,
        discovered=len(discovered_urls),
        next_pages=len(next_pages),
    )
    return len(entries), discovered_urls, next_pages
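

# Minimal offline smoke test: a sketch exercising only the pure parsing helpers
# (no Storage wiring, which the real crawler entry point owns). The HTML below
# is a hypothetical stand-in for SHL catalog markup.
if __name__ == "__main__":
    sample = (
        '<table><tr><th>Individual Test Solutions</th></tr>'
        '<tr><td><a href="/product/example">Example Test</a></td>'
        '<td><span class="-yes"></span></td><td></td></tr></table>'
        '<a href="?page=2">Next</a>'
    )
    print(extract_catalog_entries(sample))
    print(find_next_pages(sample, "https://example.com/catalog"))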