"""Scrape the SHL product catalog into JSON and CSV files.

Listing pages are walked first to collect each assessment's name, URL and
test-type letter. Each assessment's detail page is then fetched for a
description, and a keyword list is derived from the combined text.
"""

import csv
import json
import re
import time
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.shl.com"
CATALOG_URL = (
    "https://www.shl.com/solutions/products/product-catalog/"
    "?type=1&start={start}"
)
# Step applied to the ``start`` query parameter between listing pages.
PAGE_SIZE = 12

OUTPUT_DIR = Path("data")
JSON_OUTPUT = OUTPUT_DIR / "shl_catalog.json"
CSV_OUTPUT = OUTPUT_DIR / "shl_catalog.csv"

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# Single-letter test-type codes recognised by ``extract_test_type``.
# NOTE(review): confirm this set against the catalog's own legend; a letter
# that is not listed here is ignored.
TEST_TYPES = frozenset("ABCDKPS")

# Descriptive text must be strictly longer than this to be kept.
MIN_TEXT_LENGTH = 40

# Label -> terms whose presence in an assessment's text assigns the label.
KEYWORD_MAP = {
    "java": ["java"],
    "python": ["python"],
    "sql": ["sql", "database"],
    "javascript": ["javascript", "js"],
    "developer": ["developer", "software", "programming", "coding"],
    "cognitive": ["cognitive", "ability", "aptitude", "reasoning"],
    "personality": ["personality", "opq", "behavior", "behaviour"],
    "communication": ["communication", "stakeholder", "verbal"],
    "leadership": ["leadership", "manager", "management"],
    "sales": ["sales"],
    "graduate": ["graduate", "entry level", "entry-level"],
}

_WHITESPACE_RE = re.compile(r"\s+")
_SINGLE_CAPITAL_RE = re.compile(r"\b[A-Z]\b")
_NAME_TOKEN_RE = re.compile(r"[a-zA-Z][a-zA-Z0-9+#.-]+")

# One compiled pattern per label. Terms match as whole words (an optional
# plural "s" is allowed) so that "java" does not fire on "javascript" and
# "ability" does not fire on "reliability".
_KEYWORD_PATTERNS = {
    label: re.compile(
        r"\b(?:" + "|".join(re.escape(term) for term in terms) + r")s?\b"
    )
    for label, terms in KEYWORD_MAP.items()
}

Record = Dict[str, Any]


def clean_text(text: str) -> str:
    """Collapse each run of whitespace to one space and trim both ends."""
    return _WHITESPACE_RE.sub(" ", text).strip()


def fetch_page(url: str, session: Optional[requests.Session] = None) -> str:
    """Download ``url`` and return the response body as text.

    Args:
        url: Absolute URL to request.
        session: Optional session to send the request through so that
            connections are reused; a one-off request is made when omitted.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection errors and timeouts.
    """
    client = session if session is not None else requests
    response = client.get(url, headers=HEADERS, timeout=20)
    response.raise_for_status()

    # Without an explicit charset, requests decodes text/* responses as
    # ISO-8859-1, which garbles UTF-8 pages; use the detected encoding instead.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding

    return response.text


def extract_test_type(row_text: str) -> str:
    """Return the first stand-alone test-type letter found in ``row_text``.

    SHL test types are often short labels like:
        A = Ability
        B = Biodata
        C = Competency
        D = Development
        K = Knowledge/Skills
        P = Personality
        S = Simulation

    Only the first matching letter is returned, even if the text contains
    several. ``"Unknown"`` is returned when no letter from ``TEST_TYPES``
    stands alone in the text.
    """
    for token in _SINGLE_CAPITAL_RE.findall(row_text):
        if token in TEST_TYPES:
            return token
    return "Unknown"


def parse_catalog_list_page(html: str) -> List[Dict[str, str]]:
    """Parse one catalog listing page into basic assessment records.

    Every ``<tr>`` whose first link points at a ``product-catalog`` URL
    yields one record with the keys ``name``, ``url``, ``test_type`` and
    ``raw_row_text``. Rows without such a link, or with a link text shorter
    than two characters, are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    records = []

    for row in soup.find_all("tr"):
        link = row.find("a", href=True)
        if link is None:
            continue

        href = link["href"]
        # Product detail pages usually contain product-catalog in the URL.
        if "product-catalog" not in href:
            continue

        name = clean_text(link.get_text(" ", strip=True))
        if len(name) < 2:
            continue

        row_text = clean_text(row.get_text(" ", strip=True))
        # Search for the type letter outside the name; otherwise a name such
        # as "C Programming" would be reported as type "C".
        text_without_name = row_text.replace(name, " ", 1)

        records.append(
            {
                "name": name,
                "url": urljoin(BASE_URL, href),
                "test_type": extract_test_type(text_without_name),
                "raw_row_text": row_text,
            }
        )

    return records


def extract_description_from_detail_page(html: str) -> str:
    """Extract a description from an assessment detail page.

    The lookup is intentionally defensive because the site's HTML may change:

    1. the ``<meta name="description">`` content, if longer than
       ``MIN_TEXT_LENGTH`` characters;
    2. otherwise the first three ``<p>`` texts longer than
       ``MIN_TEXT_LENGTH`` characters, joined by spaces;
    3. otherwise an empty string.
    """
    soup = BeautifulSoup(html, "html.parser")

    meta = soup.find("meta", attrs={"name": "description"})
    if meta and meta.get("content"):
        meta_text = clean_text(meta["content"])
        if len(meta_text) > MIN_TEXT_LENGTH:
            return meta_text

    paragraphs = []
    for paragraph in soup.find_all("p"):
        text = clean_text(paragraph.get_text(" ", strip=True))
        if len(text) > MIN_TEXT_LENGTH:
            paragraphs.append(text)

    # Joining an empty list gives "", which is the final fallback.
    return " ".join(paragraphs[:3])


def build_keywords(name: str, description: str, raw_text: str) -> List[str]:
    """Build a sorted, de-duplicated keyword list for one assessment.

    A label from ``KEYWORD_MAP`` is included when any of its terms occurs
    as a whole word (optionally with a plural "s") in the lower-cased
    name, description or raw row text. Every token of the name that is
    longer than two characters is included as well.
    """
    text = f"{name} {description} {raw_text}".lower()

    keywords = {
        label
        for label, pattern in _KEYWORD_PATTERNS.items()
        if pattern.search(text)
    }

    # Add useful words from the name itself.
    keywords.update(
        token for token in _NAME_TOKEN_RE.findall(name.lower()) if len(token) > 2
    )

    return sorted(keywords)


def _collect_listing_records(
    session: requests.Session, max_pages: int
) -> List[Dict[str, str]]:
    """Walk the listing pages and return records with unique URLs."""
    records = []
    seen_urls = set()

    for page_num in range(max_pages):
        url = CATALOG_URL.format(start=page_num * PAGE_SIZE)
        print(f"Scraping listing page: {url}")

        try:
            html = fetch_page(url, session=session)
        except requests.RequestException as exc:
            print(f"Failed to fetch listing page {url}: {exc}")
            break

        new_count = 0
        for record in parse_catalog_list_page(html):
            if record["url"] in seen_urls:
                continue
            seen_urls.add(record["url"])
            records.append(record)
            new_count += 1

        print(f"Found {new_count} new assessments")

        # A page that contributes nothing new is treated as the end.
        if new_count == 0:
            print("No new records found. Stopping pagination.")
            break

        time.sleep(1)

    return records


def _enrich_records(
    session: requests.Session, records: List[Dict[str, str]]
) -> List[Record]:
    """Fetch each record's detail page and add description and keywords."""
    enriched = []
    total = len(records)

    for idx, record in enumerate(records, start=1):
        print(f"[{idx}/{total}] Fetching details: {record['name']}")

        # Best effort: a failed detail request leaves the description empty
        # instead of dropping the record.
        description = ""
        try:
            detail_html = fetch_page(record["url"], session=session)
        except requests.RequestException as exc:
            print(f"Failed detail page for {record['name']}: {exc}")
        else:
            description = extract_description_from_detail_page(detail_html)

        enriched.append(
            {
                "name": record["name"],
                "url": record["url"],
                "test_type": record["test_type"],
                "description": description,
                "keywords": build_keywords(
                    record["name"],
                    description,
                    record.get("raw_row_text", ""),
                ),
            }
        )

        time.sleep(0.5)

    return enriched


def scrape_catalog(max_pages: int = 100) -> List[Record]:
    """Scrape the Individual Test Solutions catalog.

    Pagination usually works with start=0,12,24... Listing pages are
    requested until ``max_pages`` is reached, a listing request fails, or
    a page yields no unseen URL. Each collected assessment is then
    enriched from its detail page.

    Args:
        max_pages: Upper bound on the number of listing pages requested.

    Returns:
        One dict per assessment with the keys ``name``, ``url``,
        ``test_type``, ``description`` and ``keywords``.
    """
    # One session for the whole run so connections are reused.
    with requests.Session() as session:
        basic_records = _collect_listing_records(session, max_pages)
        print(f"\nTotal basic records found: {len(basic_records)}")
        return _enrich_records(session, basic_records)


def save_json(records: List[Record]) -> None:
    """Write ``records`` to ``JSON_OUTPUT``, creating ``OUTPUT_DIR`` if needed."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    with open(JSON_OUTPUT, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    print(f"Saved JSON: {JSON_OUTPUT}")


def save_csv(records: List[Record]) -> None:
    """Write ``records`` to ``CSV_OUTPUT``, creating ``OUTPUT_DIR`` if needed.

    The ``keywords`` list is flattened to one comma-separated cell.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    fieldnames = ["name", "url", "test_type", "description", "keywords"]

    with open(CSV_OUTPUT, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for record in records:
            row = {**record, "keywords": ", ".join(record.get("keywords", []))}
            writer.writerow(row)

    print(f"Saved CSV: {CSV_OUTPUT}")


def main() -> None:
    """Scrape the catalog and save it as JSON and CSV.

    Raises:
        RuntimeError: If the scrape produced no records at all.
    """
    records = scrape_catalog()

    if not records:
        raise RuntimeError(
            "No catalog records scraped. Website may be blocking requests or HTML structure changed."
        )

    save_json(records)
    save_csv(records)

    print("\nDone.")
    print(f"Total records saved: {len(records)}")


if __name__ == "__main__":
    main()