SHL-Assessment-Recommender / scripts /scrape_catalog.py
kanhacoderx's picture
Create scripts/scrape_catalog.py
a358bfb verified
import json
import csv
import time
import re
from pathlib import Path
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
# Site root; used to turn relative links found in listing pages into absolute URLs.
BASE_URL = "https://www.shl.com"
# Listing-page URL template; `{start}` is filled in with the pagination offset.
CATALOG_URL = BASE_URL + "/solutions/products/product-catalog/?type=1&start={start}"

# Output locations (relative to the current working directory).
OUTPUT_DIR = Path("data")
JSON_OUTPUT = OUTPUT_DIR.joinpath("shl_catalog.json")
CSV_OUTPUT = OUTPUT_DIR.joinpath("shl_catalog.csv")

# Request headers sent with every page fetch.
HEADERS = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/122.0.0.0 Safari/537.36",
        ]
    ),
    "Accept-Language": "en-US,en;q=0.9",
}
def clean_text(text: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip()
def fetch_page(url: str) -> str:
    """
    Download a single page and return its body as text.

    The request is sent with the module-level HEADERS and a 20 second
    timeout. A non-success HTTP status is raised as an exception via
    `raise_for_status()` instead of being returned as page text.
    """
    reply = requests.get(url, timeout=20, headers=HEADERS)
    reply.raise_for_status()
    return reply.text
def extract_test_type(row_text: str) -> str:
    """
    Pick the first test-type code that appears in a row's text.

    A code is a single capital letter standing on its own. Recognised codes:
    A = Ability
    B = Biodata
    C = Competency
    D = Development
    K = Knowledge/Skills
    P = Personality
    S = Simulation

    Returns the first recognised code in reading order, or "Unknown" when
    the text contains none.
    """
    known_codes = frozenset("ABCDKPS")
    standalone_capitals = (m.group() for m in re.finditer(r"\b[A-Z]\b", row_text))
    return next(
        (letter for letter in standalone_capitals if letter in known_codes),
        "Unknown",
    )
def parse_catalog_list_page(html: str):
    """
    Parse one catalog listing page into basic assessment records.

    Each table row (`<tr>`) is inspected. A row produces a record when its
    first link has an href containing "product-catalog" and a link text of
    at least two characters.

    Args:
        html: Raw HTML of a catalog listing page.

    Returns:
        A list of dicts with the keys:
        - "name": whitespace-normalised link text
        - "url": link target resolved against BASE_URL
        - "test_type": result of extract_test_type() for the row
        - "raw_row_text": whitespace-normalised text of the whole row
    """
    soup = BeautifulSoup(html, "html.parser")
    records = []

    for row in soup.find_all("tr"):
        link = row.find("a", href=True)
        if link is None:
            continue

        href = link["href"]
        # Product detail pages usually contain product-catalog in URL
        if "product-catalog" not in href:
            continue

        name = clean_text(link.get_text(" ", strip=True))
        if len(name) < 2:
            continue

        row_text = clean_text(row.get_text(" ", strip=True))

        # The row text starts with the assessment name. Names such as
        # "C++ Programming" contain a standalone capital letter that would be
        # mistaken for a test-type code, so the name is removed before the
        # type scan. The untouched row text is still stored in the record.
        type_text = row_text.replace(name, " ", 1)

        records.append(
            {
                "name": name,
                "url": urljoin(BASE_URL, href),
                "test_type": extract_test_type(type_text),
                "raw_row_text": row_text,
            }
        )

    return records
def extract_description_from_detail_page(html: str) -> str:
    """
    Pull a description out of an assessment detail page.

    Two sources are tried in order, and text of 40 characters or fewer is
    treated as too short to be useful:
    1. the `<meta name="description">` content;
    2. the first three sufficiently long `<p>` paragraphs, joined by spaces.

    Returns an empty string when neither source yields anything.
    """
    min_length = 40
    page = BeautifulSoup(html, "html.parser")

    # 1. Meta description
    meta_tag = page.find("meta", attrs={"name": "description"})
    meta_content = meta_tag.get("content") if meta_tag else None
    if meta_content:
        meta_text = clean_text(meta_content)
        if len(meta_text) > min_length:
            return meta_text

    # 2. Paragraphs (an empty selection joins to "", which is the fallback)
    cleaned_paragraphs = (
        clean_text(p.get_text(" ", strip=True)) for p in page.find_all("p")
    )
    long_paragraphs = [text for text in cleaned_paragraphs if len(text) > min_length]
    return " ".join(long_paragraphs[:3])
def build_keywords(name: str, description: str, raw_text: str):
    """
    Build a simple searchable keyword list for one assessment.

    Two sources feed the result:
    1. Topic labels: a label is added when any of its trigger terms occurs
       as a whole word (an optional plural "s" is tolerated) in the
       lower-cased name, description or raw row text.
    2. Name tokens: every token of the lower-cased name that starts with a
       letter and is longer than two characters.

    Args:
        name: Assessment name.
        description: Assessment description (may be empty).
        raw_text: Raw text of the catalog row (may be empty).

    Returns:
        The distinct keywords as an alphabetically sorted list.
    """
    text = f"{name} {description} {raw_text}".lower()

    keyword_map = {
        "java": ["java"],
        "python": ["python"],
        # Product names that embed "sql" are listed explicitly because
        # matching is on whole words.
        "sql": ["sql", "mysql", "postgresql", "database"],
        "javascript": ["javascript", "js"],
        "developer": ["developer", "software", "programming", "coding"],
        "cognitive": ["cognitive", "ability", "aptitude", "reasoning"],
        "personality": [
            "personality",
            "opq",
            "behavior",
            "behaviour",
            "behavioral",
            "behavioural",
        ],
        "communication": ["communication", "stakeholder", "verbal"],
        "leadership": ["leadership", "manager", "management", "managerial"],
        "sales": ["sales"],
        "graduate": ["graduate", "entry level", "entry-level"],
    }

    keywords = set()

    for label, patterns in keyword_map.items():
        alternatives = "|".join(re.escape(pattern) for pattern in patterns)
        # Whole-word matching: a plain substring test would tag every
        # "javascript" item as "java", "salesforce" as "sales" and
        # "availability" as "cognitive".
        if re.search(rf"\b(?:{alternatives})s?\b", text):
            keywords.add(label)

    # Add useful words from name
    for token in re.findall(r"[a-zA-Z][a-zA-Z0-9+#.-]+", name.lower()):
        if len(token) > 2:
            keywords.add(token)

    return sorted(keywords)
def scrape_catalog(max_pages: int = 100):
    """
    Scrape the catalog listing pages, then enrich each entry from its detail page.

    Phase 1 walks listing pages at offsets 0, 12, 24, ... for at most
    `max_pages` pages. It stops early when a listing page cannot be fetched
    or when a page contributes no URL that has not been seen already.

    Phase 2 fetches each collected detail page to obtain a description. A
    failure there is reported and leaves the description empty; the record
    is still kept.

    Returns:
        A list of dicts with the keys "name", "url", "test_type",
        "description" and "keywords".
    """
    page_size = 12
    collected = []
    known_urls = set()

    # ---- Phase 1: listing pages ----
    for offset in range(0, max_pages * page_size, page_size):
        listing_url = CATALOG_URL.format(start=offset)
        print(f"Scraping listing page: {listing_url}")

        try:
            listing_html = fetch_page(listing_url)
        except Exception as e:
            print(f"Failed to fetch listing page {listing_url}: {e}")
            break

        count_before = len(collected)
        for item in parse_catalog_list_page(listing_html):
            if item["url"] not in known_urls:
                known_urls.add(item["url"])
                collected.append(item)
        new_count = len(collected) - count_before

        print(f"Found {new_count} new assessments")
        if not new_count:
            print("No new records found. Stopping pagination.")
            break

        # Pause between listing requests.
        time.sleep(1)

    total = len(collected)
    print(f"\nTotal basic records found: {total}")

    # ---- Phase 2: detail pages ----
    enriched = []
    for position, item in enumerate(collected, start=1):
        print(f"[{position}/{total}] Fetching details: {item['name']}")

        try:
            description = extract_description_from_detail_page(
                fetch_page(item["url"])
            )
        except Exception as e:
            # Best effort: keep the record, just without a description.
            print(f"Failed detail page for {item['name']}: {e}")
            description = ""

        keywords = build_keywords(
            item["name"],
            description,
            item.get("raw_row_text", ""),
        )
        enriched.append(
            {
                "name": item["name"],
                "url": item["url"],
                "test_type": item["test_type"],
                "description": description,
                "keywords": keywords,
            }
        )

        # Pause between detail requests.
        time.sleep(0.5)

    return enriched
def save_json(records):
    """Write `records` to JSON_OUTPUT as indented UTF-8 JSON, creating OUTPUT_DIR if needed."""
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
    with JSON_OUTPUT.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(records, handle, ensure_ascii=False, indent=2)
    print(f"Saved JSON: {JSON_OUTPUT}")
def save_csv(records):
    """
    Write `records` to CSV_OUTPUT with a header row, creating OUTPUT_DIR if needed.

    The "keywords" list of each record is flattened into one
    comma-separated cell.
    """
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
    columns = ["name", "url", "test_type", "description", "keywords"]

    with CSV_OUTPUT.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for record in records:
            joined_keywords = ", ".join(record.get("keywords", []))
            writer.writerow({**record, "keywords": joined_keywords})

    print(f"Saved CSV: {CSV_OUTPUT}")
def main():
    """
    Run the scrape and write the results as JSON and CSV.

    Raises:
        RuntimeError: if the scrape produced no records at all.
    """
    records = scrape_catalog()

    if len(records) == 0:
        raise RuntimeError(
            "No catalog records scraped. Website may be blocking requests or HTML structure changed."
        )

    for writer in (save_json, save_csv):
        writer(records)

    print("\nDone.")
    print(f"Total records saved: {len(records)}")


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()