# Hugging Face Space: kanhacoderx/SHL-Assessment-Recommender (status: Sleeping)
# File size: 7,397 Bytes
# Revision: a358bfb

import json
import csv
import time
import re
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


# Site root; relative links found on catalog pages are resolved against it.
BASE_URL = "https://www.shl.com"
# Listing-page template; ``{start}`` is filled in with the pagination offset.
CATALOG_URL = "https://www.shl.com/solutions/products/product-catalog/?type=1&start={start}"

# Output files live in a ``data`` directory relative to the working directory.
OUTPUT_DIR = Path("data")
JSON_OUTPUT = OUTPUT_DIR.joinpath("shl_catalog.json")
CSV_OUTPUT = OUTPUT_DIR.joinpath("shl_catalog.csv")


# Request headers: a desktop Chrome-on-Windows user agent and an English
# language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}


def clean_text(text: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends."""
    # str.split() with no argument splits on whitespace runs and drops
    # leading/trailing whitespace, so re-joining yields the normalised text.
    words = text.split()
    return " ".join(words)


def fetch_page(url: str) -> str:
    """
    Download one page and return its body as text.

    The request is sent with the module-level HEADERS and a 20 second
    timeout. An HTTP error status is turned into an exception by
    ``raise_for_status`` instead of being returned as page content.
    """
    reply = requests.get(url, headers=HEADERS, timeout=20)
    reply.raise_for_status()
    body = reply.text
    return body


def extract_test_type(row_text: str) -> str:
    """
    Return a single SHL test-type code found in a catalog row's text.

    SHL test types are often short labels like:
    A = Ability
    B = Biodata
    C = Competency
    D = Development
    K = Knowledge/Skills
    P = Personality
    S = Simulation

    The row text also contains the assessment name, which may itself hold a
    stand-alone capital letter (for example "C Programming"). To avoid
    mistaking such a letter for the type, the run of type codes that closes
    the row is preferred and its leftmost code is returned. When the row does
    not end with type codes, the first stand-alone capital letter anywhere in
    the text that is a known code is used instead.

    NOTE(review): preferring the trailing codes assumes the type column is
    rendered last in each row -- confirm against the live catalog markup.

    Args:
        row_text: Text of one catalog table row.

    Returns:
        One of the codes listed above, or "Unknown" when none is found.
    """
    possible_types = frozenset("ABCDKPS")

    # Walk backwards over whitespace-separated words, collecting the
    # contiguous run of type codes at the end of the row.
    trailing_codes = []
    for word in reversed(row_text.split()):
        if word not in possible_types:
            break
        trailing_codes.append(word)

    if trailing_codes:
        # Collected right-to-left, so the last entry is the leftmost code.
        return trailing_codes[-1]

    # Fallback: first stand-alone capital letter that is a known code.
    for token in re.findall(r"\b[A-Z]\b", row_text):
        if token in possible_types:
            return token

    return "Unknown"


def parse_catalog_list_page(html: str):
    """
    Extract basic assessment records from one catalog listing page.

    Every ``<tr>`` whose first link points at a "product-catalog" URL and has
    a link text of at least two characters becomes one record.

    Returns a list of dicts with the keys ``name``, ``url``, ``test_type``
    and ``raw_row_text``.
    """
    document = BeautifulSoup(html, "html.parser")
    found = []

    for table_row in document.find_all("tr"):
        anchor = table_row.find("a", href=True)
        if anchor is None:
            continue

        target = anchor["href"]
        # Product detail pages usually contain product-catalog in URL
        if "product-catalog" not in target:
            continue

        title = clean_text(anchor.get_text(" ", strip=True))
        # Also rejects the empty string, whose length is 0.
        if len(title) < 2:
            continue

        full_row = clean_text(table_row.get_text(" ", strip=True))
        entry = {
            "name": title,
            "url": urljoin(BASE_URL, target),
            "test_type": extract_test_type(full_row),
            "raw_row_text": full_row,
        }
        found.append(entry)

    return found


def extract_description_from_detail_page(html: str) -> str:
    """
    Pull a description out of an assessment detail page.

    Two sources are tried in order, because the page markup may change:
    the ``<meta name="description">`` content, then the page's paragraphs.
    Only text longer than 40 characters (after whitespace clean-up) counts.
    Returns an empty string when neither source yields anything.
    """
    document = BeautifulSoup(html, "html.parser")

    # 1. Meta description, if present and long enough.
    meta_tag = document.find("meta", attrs={"name": "description"})
    meta_content = meta_tag.get("content") if meta_tag is not None else None
    if meta_content:
        summary = clean_text(meta_content)
        if len(summary) > 40:
            return summary

    # 2. Otherwise the first three sufficiently long paragraphs, joined.
    cleaned = (
        clean_text(paragraph.get_text(" ", strip=True))
        for paragraph in document.find_all("p")
    )
    substantial = [passage for passage in cleaned if len(passage) > 40]

    # 3. Joining an empty list gives "", which is the fallback value.
    return " ".join(substantial[:3])


def build_keywords(name: str, description: str, raw_text: str):
    """
    Derive a sorted list of search keywords for one assessment.

    A label is added when any of its trigger substrings occurs in the
    lower-cased name, description and raw row text. Every token of the
    lower-cased name that is longer than two characters is added as well.
    Later we can improve this with embeddings.
    """
    haystack = f"{name} {description} {raw_text}".lower()

    # label -> substrings whose presence in the text implies that label
    triggers = {
        "java": ("java",),
        "python": ("python",),
        "sql": ("sql", "database"),
        "javascript": ("javascript", "js"),
        "developer": ("developer", "software", "programming", "coding"),
        "cognitive": ("cognitive", "ability", "aptitude", "reasoning"),
        "personality": ("personality", "opq", "behavior", "behaviour"),
        "communication": ("communication", "stakeholder", "verbal"),
        "leadership": ("leadership", "manager", "management"),
        "sales": ("sales",),
        "graduate": ("graduate", "entry level", "entry-level"),
    }

    found = {
        label
        for label, needles in triggers.items()
        if any(needle in haystack for needle in needles)
    }

    # Add useful words from name
    name_tokens = re.findall(r"[a-zA-Z][a-zA-Z0-9+#.-]+", name.lower())
    found.update(word for word in name_tokens if len(word) > 2)

    return sorted(found)


def scrape_catalog(max_pages: int = 100):
    """
    Scrape Individual Test Solutions catalog.

    Phase one walks the listing pages (pagination usually works with
    start=0,12,24...) and collects each assessment once, keyed by URL. It
    stops at ``max_pages``, on the first listing page that fails to load, or
    on the first page that adds nothing new.

    Phase two fetches each assessment's detail page for a description; a
    failure there is printed and the description is left empty.

    Returns a list of dicts with the keys ``name``, ``url``, ``test_type``,
    ``description`` and ``keywords``.
    """
    collected = []
    known_urls = set()

    for offset in range(0, max_pages * 12, 12):
        url = CATALOG_URL.format(start=offset)

        print(f"Scraping listing page: {url}")

        try:
            html = fetch_page(url)
        except Exception as e:
            print(f"Failed to fetch listing page {url}: {e}")
            break

        count_before = len(collected)

        for candidate in parse_catalog_list_page(html):
            link = candidate["url"]
            if link not in known_urls:
                known_urls.add(link)
                collected.append(candidate)

        new_count = len(collected) - count_before
        print(f"Found {new_count} new assessments")

        if not new_count:
            print("No new records found. Stopping pagination.")
            break

        # Pause between listing requests.
        time.sleep(1)

    total = len(collected)
    print(f"\nTotal basic records found: {total}")

    enriched_records = []

    for position, record in enumerate(collected, start=1):
        print(f"[{position}/{total}] Fetching details: {record['name']}")

        try:
            description = extract_description_from_detail_page(
                fetch_page(record["url"])
            )
        except Exception as e:
            print(f"Failed detail page for {record['name']}: {e}")
            description = ""

        keywords = build_keywords(
            record["name"],
            description,
            record.get("raw_row_text", ""),
        )

        enriched_records.append(
            {
                "name": record["name"],
                "url": record["url"],
                "test_type": record["test_type"],
                "description": description,
                "keywords": keywords,
            }
        )
        # Shorter pause between detail-page requests.
        time.sleep(0.5)

    return enriched_records


def save_json(records):
    """Write ``records`` to JSON_OUTPUT as indented UTF-8 JSON, creating OUTPUT_DIR if needed."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    with open(JSON_OUTPUT, mode="w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(records, handle, ensure_ascii=False, indent=2)

    print(f"Saved JSON: {JSON_OUTPUT}")


def save_csv(records):
    """
    Write ``records`` to CSV_OUTPUT with a header row, creating OUTPUT_DIR if needed.

    Each record's ``keywords`` list is flattened into one comma-separated
    cell; a record without ``keywords`` gets an empty cell.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    columns = ["name", "url", "test_type", "description", "keywords"]

    # newline="" lets the csv module control line endings itself.
    with open(CSV_OUTPUT, "w", encoding="utf-8", newline="") as handle:
        sheet = csv.DictWriter(handle, fieldnames=columns)
        sheet.writeheader()

        for entry in records:
            flattened = ", ".join(entry.get("keywords", []))
            sheet.writerow({**entry, "keywords": flattened})

    print(f"Saved CSV: {CSV_OUTPUT}")


def main():
    """
    Scrape the catalog and write it out as JSON and CSV.

    Raises RuntimeError when the scrape returns no records, so that empty
    output files are never written.
    """
    records = scrape_catalog()

    if len(records) == 0:
        message = "No catalog records scraped. Website may be blocking requests or HTML structure changed."
        raise RuntimeError(message)

    for writer in (save_json, save_csv):
        writer(records)

    print("\nDone.")
    print(f"Total records saved: {len(records)}")


# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()