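"""Parsers for SHL catalog listing pages.

Extracts "Individual Test Solutions" entries and pagination links from catalog
HTML, and records discovered detail pages and assessment metadata via
``crawler.storage``.
"""
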
from __future__ import annotations

from typing import List, Tuple
from urllib.parse import urljoin

import structlog
from bs4 import BeautifulSoup

from crawler.storage import (
    PAGE_TYPE_DETAIL,
    PARSE_PARSED,
    PageRecord,
    Storage,
)
from crawler.utils import canonicalize_url, now_iso

logger = structlog.get_logger(__name__)

# Single-letter test-type codes from SHL's catalog legend (e.g. A = Ability
# & Aptitude, K = Knowledge & Skills, P = Personality & Behavior).
ALLOWED_TEST_TYPES = {"A", "B", "C", "D", "E", "K", "P", "S"}
# Substrings that mark SHL's green "yes" indicator in inline styles, class
# names, or SVG fills; "8ac640" also matches the "#8ac640" hex form.
GREEN_TOKENS = ["green", "8ac640", "rgb(138", "rgb(103", "0, 167, 83"]


def _has_green_indicator(cell) -> bool:
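    """Return True when *cell* contains a green "yes" indicator.

    SHL marks support with green circles, but the color can live in inline
    styles, class names, or SVG ``fill`` attributes, so several heuristics
    are tried in turn. Minimal doctest (hand-written markup, not a real
    shl.com snippet):

        >>> soup = BeautifulSoup(
        ...     '<table><tr><td><span style="color: #8ac640"></span></td></tr></table>',
        ...     "lxml",
        ... )
        >>> _has_green_indicator(soup.td)
        True
    """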
    for el in cell.find_all(True):
        style = (el.get("style") or "").lower()
        classes = " ".join(el.get("class", [])).lower() if isinstance(el.get("class"), list) else str(el.get("class") or "").lower()
        combined = f"{style} {classes}"
        if any(tok in combined for tok in GREEN_TOKENS):
            return True
        if "-yes" in classes or "catalogue__circle" in classes:
            return True
        fill = (el.get("fill") or "").lower()
        if any(tok in fill for tok in GREEN_TOKENS):
            return True
        # Generic icon/dot detection (when color is applied via CSS)
        if el.name in {"svg", "circle", "path", "i"}:
            return True
        if "dot" in classes or "indicator" in classes:
            return True
    return False


def extract_catalog_entries(html: str) -> List[dict]:
    """Parse catalog page for individual test solutions.

    This is intentionally defensive; selectors may change on shl.com. We look for anchors within
    sections that mention "Individual Test Solutions" or tables with product rows.
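
    Minimal doctest (hand-written markup, not captured from shl.com):

        >>> html = (
        ...     "<table><tr><th>Individual Test Solutions</th></tr>"
        ...     "<tr><td><a href='/product/x'>Test X</a></td>"
        ...     "<td><span>A</span><span>K</span></td></tr></table>"
        ... )
        >>> entries = extract_catalog_entries(html)
        >>> entries[0]["name"], entries[0]["url"], entries[0]["test_type"]
        ('Test X', '/product/x', 'A,K')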
    """
    soup = BeautifulSoup(html, "lxml")
    entries = []

    tables = soup.find_all("table")
    for table in tables:
        headers = " ".join(th.get_text(" ", strip=True) for th in table.find_all("th"))
        if "Individual Test Solutions" not in headers and "Assessment" not in headers:
            continue
        for row in table.find_all("tr"):
            link = row.find("a", href=True)
            if not link:
                continue
            name = link.get_text(strip=True)
            detail_url = link["href"]
            badges_text = [span.get_text("", strip=True) for span in row.find_all("span")]
            test_letters = []
            for token in badges_text:
                token = token.strip()
                if len(token) == 1 and token in ALLOWED_TEST_TYPES:
                    test_letters.append(token)
            test_type = ",".join(dict.fromkeys(test_letters)) or None
            tds = row.find_all("td")
            if len(tds) >= 3:
                # Tabular layout: column 1 holds the remote-testing dot and
                # column 2 the adaptive/IRT dot (column 0 holds the name).
                remote = _has_green_indicator(tds[1])
                adaptive = _has_green_indicator(tds[2])
            else:
                # Fallback for non-tabular layouts: scan badge text instead.
                flat_badges = " ".join(badges_text).lower()
                remote = "remote" in flat_badges
                adaptive = "adaptive" in flat_badges or "irt" in flat_badges
            entries.append(
                {
                    "name": name,
                    "url": detail_url,
                    "test_type": test_type,
                    # Record only positive signals: False collapses to None so
                    # a missing indicator reads as "unknown" downstream.
                    "remote_support": remote or None,
                    "adaptive_support": adaptive or None,
                }
            )
    return entries


def find_next_pages(html: str, source_url: str) -> List[str]:
    """Find pagination links (Next or numbered) and resolve to absolute URLs."""
    soup = BeautifulSoup(html, "lxml")
    urls = []
    for link in soup.find_all("a", href=True):
        text = link.get_text(" ", strip=True).lower()
        if "next" in text or text.isdigit():
            urls.append(canonicalize_url(urljoin(source_url, link["href"])))
    # de-duplicate while preserving order (same dict.fromkeys idiom used for
    # test-type letters above)
    return list(dict.fromkeys(urls))


def parse_catalog_page(html: str, source_url: str, storage: Storage) -> Tuple[int, List[str], List[str]]:
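    """Parse one catalog page and persist what it yields.

    Upserts a detail-page record and an assessment row for every entry, marks
    the source page as parsed, and returns a tuple of (entry count, discovered
    detail URLs, next catalog-page URLs).
    """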
    entries = extract_catalog_entries(html)
    discovered_urls: List[str] = []

    for entry in entries:
        detail_url = canonicalize_url(urljoin(source_url, entry["url"]))
        discovered_urls.append(detail_url)
        storage.upsert_page(
            PageRecord(
                url=detail_url,
                page_type=PAGE_TYPE_DETAIL,
            )
        )
        storage.upsert_assessment(
            {
                "url": detail_url,
                "name": entry.get("name"),
                "test_type": entry.get("test_type"),
                "remote_support": entry.get("remote_support"),
                "adaptive_support": entry.get("adaptive_support"),
                "source_catalog_page": canonicalize_url(source_url),
                "discovered_at": now_iso(),
            }
        )

    storage.update_parse_status(source_url, PARSE_PARSED)
    next_pages = find_next_pages(html, source_url)
    logger.info(
        "catalog.parse.summary",
        source_url=source_url,
        discovered=len(discovered_urls),
        next_pages=len(next_pages),
    )
    return len(entries), discovered_urls, next_pages
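

# Usage sketch (hypothetical wiring: `fetch` and `frontier` are placeholders,
# not part of this module, and Storage construction lives in crawler.storage):
#
#     html = fetch(catalog_url)
#     parsed, detail_urls, next_pages = parse_catalog_page(html, catalog_url, storage)
#     for url in next_pages:
#         frontier.enqueue(url)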