| import json |
| import csv |
| import time |
| import re |
| from pathlib import Path |
| from urllib.parse import urljoin |
|
|
| import requests |
| from bs4 import BeautifulSoup |
|
|
|
|
# Site root; relative links found on listing pages are resolved against it.
BASE_URL = "https://www.shl.com"
# Listing-page URL template; `{start}` is the pagination offset filled in
# with str.format().
CATALOG_URL = BASE_URL + "/solutions/products/product-catalog/?type=1&start={start}"
|
|
# Directory that receives the scraped catalog, plus the two export files
# written inside it.
OUTPUT_DIR = Path("data")
JSON_OUTPUT = OUTPUT_DIR.joinpath("shl_catalog.json")
CSV_OUTPUT = OUTPUT_DIR.joinpath("shl_catalog.csv")
|
|
|
|
# HTTP request headers: a desktop-Chrome User-Agent string and an English
# Accept-Language preference.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/122.0.0.0 Safari/537.36",
        )
    ),
    "Accept-Language": "en-US,en;q=0.9",
}
|
|
|
|
def clean_text(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends."""
    # str.split() with no arguments splits on whitespace runs and drops
    # leading/trailing whitespace, so re-joining with single spaces gives
    # the normalized form.
    words = text.split()
    return " ".join(words)
|
|
|
|
def fetch_page(url: str) -> str:
    """
    Download a single page and return its body as text.

    Sends the module-level HEADERS and gives up after a 20-second timeout.
    Raises whatever `requests` raises for connection problems, and an HTTP
    error (via raise_for_status) for 4xx/5xx responses.
    """
    reply = requests.get(url, timeout=20, headers=HEADERS)
    # Turn error status codes into exceptions instead of returning error pages.
    reply.raise_for_status()
    return reply.text
|
|
|
|
def extract_test_type(row_text: str) -> str:
    """
    Return the first recognized single-letter test-type code in a row's text.

    Recognized codes:
    A = Ability
    B = Biodata
    C = Competency
    D = Development
    K = Knowledge/Skills
    P = Personality
    S = Simulation

    Only stand-alone capital letters (a single letter delimited by word
    boundaries) are considered; they are checked left to right across the
    whole text. Returns "Unknown" when none of them is a recognized code.
    """
    known_codes = {"A", "B", "C", "D", "K", "P", "S"}

    standalone_letters = (hit.group() for hit in re.finditer(r"\b[A-Z]\b", row_text))
    return next(
        (letter for letter in standalone_letters if letter in known_codes),
        "Unknown",
    )
|
|
|
|
def _row_to_record(row):
    """Turn one <tr> element into a basic record dict, or None if it does not qualify."""
    anchor = row.find("a", href=True)
    if anchor is None:
        return None

    target = anchor["href"]
    # Only rows whose first link points into the product catalog are kept.
    if "product-catalog" not in target:
        return None

    title = clean_text(anchor.get_text(" ", strip=True))
    # Skip rows whose link text is empty or a single character.
    if len(title) < 2:
        return None

    flattened = clean_text(row.get_text(" ", strip=True))
    return {
        "name": title,
        "url": urljoin(BASE_URL, target),
        "test_type": extract_test_type(flattened),
        "raw_row_text": flattened,
    }


def parse_catalog_list_page(html: str):
    """
    Parse one catalog listing page into basic assessment records.

    Every <tr> in the document is inspected. A row yields a record when its
    first link with an href contains "product-catalog" and has link text of
    at least two characters. Each record is a dict with the keys "name",
    "url" (made absolute against BASE_URL), "test_type" and "raw_row_text".
    Records are returned in document order.
    """
    table_rows = BeautifulSoup(html, "html.parser").find_all("tr")
    candidates = (_row_to_record(row) for row in table_rows)
    return [record for record in candidates if record is not None]
|
|
|
|
def extract_description_from_detail_page(html: str) -> str:
    """
    Pull a description out of an assessment detail page.

    Strategy, in order:
    1. The <meta name="description"> content, if it is longer than 40
       characters after whitespace cleanup.
    2. Otherwise the first three <p> elements whose cleaned text is longer
       than 40 characters, joined with single spaces.
    Returns an empty string when neither source yields anything.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Preferred source: the page's meta description.
    meta_tag = soup.find("meta", attrs={"name": "description"})
    meta_content = meta_tag.get("content") if meta_tag is not None else None
    if meta_content:
        summary = clean_text(meta_content)
        if len(summary) > 40:
            return summary

    # Fallback: body paragraphs, ignoring short snippets.
    cleaned = [clean_text(p.get_text(" ", strip=True)) for p in soup.find_all("p")]
    substantial = [text for text in cleaned if len(text) > 40]

    # Joining an empty list gives "", which is the "nothing found" result.
    return " ".join(substantial[:3])
|
|
|
|
def build_keywords(name: str, description: str, raw_text: str):
    """
    Build a simple, sorted list of searchable keywords for one assessment.

    Two sources feed the result:
    1. Category labels: a label is added when any of its trigger terms
       appears in the lowercased name + description + raw row text as a
       whole word (an optional plural "s" is tolerated). Whole-word matching
       avoids false positives from plain substring checks, e.g. "java"
       inside "javascript", "js" inside "json", or "ability" inside
       "reliability".
    2. Name tokens: every token of the lowercased name that starts with a
       letter and is longer than two characters.

    Later we can improve this with embeddings.

    Args:
        name: Assessment name.
        description: Description text (may be empty).
        raw_text: Raw text of the listing row (may be empty).

    Returns:
        Alphabetically sorted list of unique keywords.
    """
    text = f"{name} {description} {raw_text}".lower()

    keyword_map = {
        "java": ["java"],
        "python": ["python"],
        "sql": ["sql", "database"],
        "javascript": ["javascript", "js"],
        "developer": ["developer", "software", "programming", "coding"],
        "cognitive": ["cognitive", "ability", "aptitude", "reasoning"],
        "personality": ["personality", "opq", "behavior", "behaviour"],
        "communication": ["communication", "stakeholder", "verbal"],
        "leadership": ["leadership", "manager", "management"],
        "sales": ["sales"],
        "graduate": ["graduate", "entry level", "entry-level"],
    }

    keywords = set()

    for label, patterns in keyword_map.items():
        alternatives = "|".join(re.escape(pattern) for pattern in patterns)
        # The lookarounds require a non-alphanumeric character (or the text
        # edge) on both sides, so terms only match as whole words.
        whole_word = rf"(?<![a-z0-9])(?:{alternatives})s?(?![a-z0-9])"
        if re.search(whole_word, text):
            keywords.add(label)

    # Add the name's own tokens so exact product names stay searchable.
    for token in re.findall(r"[a-zA-Z][a-zA-Z0-9+#.-]+", name.lower()):
        if len(token) > 2:
            keywords.add(token)

    return sorted(keywords)
|
|
|
|
def _collect_listing_records(max_pages: int, page_size: int, delay: float):
    """Walk the listing pages and return de-duplicated basic records in discovery order."""
    collected = []
    seen_urls = set()

    for page_num in range(max_pages):
        url = CATALOG_URL.format(start=page_num * page_size)
        print(f"Scraping listing page: {url}")

        try:
            html = fetch_page(url)
        except Exception as e:
            # Best-effort: keep what was gathered so far instead of aborting.
            print(f"Failed to fetch listing page {url}: {e}")
            break

        new_count = 0
        for record in parse_catalog_list_page(html):
            if record["url"] in seen_urls:
                continue
            seen_urls.add(record["url"])
            collected.append(record)
            new_count += 1

        print(f"Found {new_count} new assessments")

        # A page that adds nothing new is treated as the end of the catalog.
        if new_count == 0:
            print("No new records found. Stopping pagination.")
            break

        time.sleep(delay)

    return collected


def _enrich_record(record):
    """Fetch one record's detail page and return the final record with description and keywords."""
    description = ""

    try:
        detail_html = fetch_page(record["url"])
        description = extract_description_from_detail_page(detail_html)
    except Exception as e:
        # Best-effort: a broken detail page leaves the description empty.
        print(f"Failed detail page for {record['name']}: {e}")

    return {
        "name": record["name"],
        "url": record["url"],
        "test_type": record["test_type"],
        "description": description,
        "keywords": build_keywords(
            record["name"],
            description,
            record.get("raw_row_text", ""),
        ),
    }


def scrape_catalog(
    max_pages: int = 100,
    *,
    page_size: int = 12,
    listing_delay: float = 1.0,
    detail_delay: float = 0.5,
):
    """
    Scrape Individual Test Solutions catalog.

    Phase 1 walks the listing pages (start=0, page_size, 2*page_size, ...)
    and gathers basic records, de-duplicated by URL. It stops at `max_pages`,
    at the first listing page that fails to download, or at the first page
    that contributes no new record. Phase 2 fetches every record's detail
    page to add a description and keywords; a failing detail page yields an
    empty description rather than an error.

    Args:
        max_pages: Upper bound on the number of listing pages requested.
        page_size: Pagination step used for the `start` offset.
        listing_delay: Seconds to sleep after each listing page.
        detail_delay: Seconds to sleep after each detail page.

    Returns:
        List of dicts with the keys "name", "url", "test_type",
        "description" and "keywords".

    Raises:
        ValueError: If `page_size` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    all_records = _collect_listing_records(max_pages, page_size, listing_delay)

    print(f"\nTotal basic records found: {len(all_records)}")

    enriched_records = []

    for idx, record in enumerate(all_records, start=1):
        print(f"[{idx}/{len(all_records)}] Fetching details: {record['name']}")
        enriched_records.append(_enrich_record(record))
        time.sleep(detail_delay)

    return enriched_records
|
|
|
|
def save_json(records):
    """
    Write the records to JSON_OUTPUT as pretty-printed UTF-8 JSON.

    Creates OUTPUT_DIR (and any missing parents) first. Non-ASCII characters
    are written as-is rather than escaped.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    with JSON_OUTPUT.open(mode="w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)

    print(f"Saved JSON: {JSON_OUTPUT}")
|
|
|
|
def save_csv(records):
    """
    Write the records to CSV_OUTPUT as UTF-8 CSV with a header row.

    Creates OUTPUT_DIR (and any missing parents) first. Each record's
    keyword list is flattened into one comma-separated cell; the input
    records themselves are not modified.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    columns = ["name", "url", "test_type", "description", "keywords"]

    # newline="" lets the csv module control line endings itself.
    with CSV_OUTPUT.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(
            {**record, "keywords": ", ".join(record.get("keywords", []))}
            for record in records
        )

    print(f"Saved CSV: {CSV_OUTPUT}")
|
|
|
|
def main():
    """
    Scrape the catalog and export it as JSON and CSV.

    Raises RuntimeError when the scrape returns no records, so an empty
    result never produces empty output files.
    """
    records = scrape_catalog()

    if not records:
        raise RuntimeError(
            "No catalog records scraped. Website may be blocking requests or HTML structure changed."
        )

    # JSON first, then CSV.
    for persist in (save_json, save_csv):
        persist(records)

    print("\nDone.")
    print(f"Total records saved: {len(records)}")


# Run the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()