""" SHL Product Catalog Web Scraper This module scrapes the SHL Product Catalog to extract Individual Test Solutions. It handles pagination, dynamic content, and extracts assessment details. """ import requests from bs4 import BeautifulSoup import pandas as pd import time import logging from typing import List, Dict import re # Set up logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) class SHLCrawler: """Scraper for SHL Product Catalog""" def __init__(self): self.base_url = "https://www.shl.com/solutions/products/product-catalog/" self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } self.assessments = [] def fetch_page(self, url: str) -> BeautifulSoup: """Fetch and parse a webpage""" try: response = requests.get(url, headers=self.headers, timeout=30) response.raise_for_status() return BeautifulSoup(response.content, 'lxml') except Exception as e: logger.error(f"Error fetching {url}: {e}") return None def extract_assessment_details(self, soup: BeautifulSoup) -> List[Dict]: """Extract individual test solutions from the page""" assessments = [] try: # Look for assessment cards or links # The actual structure depends on the SHL website # This is a robust implementation that tries multiple selectors # Try to find all links that might be assessments links = soup.find_all('a', href=True) for link in links: href = link.get('href', '') text = link.get_text(strip=True) # Filter for individual test solutions # Skip pre-packaged solutions and navigation links if (text and len(text) > 3 and 'solution' not in text.lower() or 'test' in text.lower() or 'assessment' in text.lower()): # Try to determine if it's a knowledge or personality test test_type = self.determine_test_type(text) if test_type: assessment = { 'assessment_name': text, 'assessment_url': self.normalize_url(href), 'category': self.extract_category(text), 'test_type': test_type, 'description': self.extract_description(link) } # Avoid duplicates if assessment not in assessments: assessments.append(assessment) # Try finding specific elements for assessments assessment_sections = soup.find_all(['div', 'article'], class_=re.compile(r'product|assessment|test', re.I)) for section in assessment_sections: title_elem = section.find(['h2', 'h3', 'h4', 'a']) if title_elem: title = title_elem.get_text(strip=True) # Get the link link_elem = section.find('a', href=True) url = link_elem.get('href', '') if link_elem else '' # Get description desc_elem = section.find(['p', 'div'], class_=re.compile(r'desc|summary|content', re.I)) description = desc_elem.get_text(strip=True) if desc_elem else title test_type = self.determine_test_type(title + ' ' + description) if test_type and title: assessment = { 'assessment_name': title, 'assessment_url': self.normalize_url(url), 'category': self.extract_category(title), 'test_type': test_type, 'description': description[:500] if description else title } # Avoid duplicates if assessment not in assessments and len(assessment['assessment_name']) > 3: assessments.append(assessment) except Exception as e: logger.error(f"Error extracting assessments: {e}") return assessments def determine_test_type(self, text: str) -> str: """Determine if assessment is Knowledge (K) or Personality (P)""" text_lower = text.lower() # Knowledge/Skill indicators knowledge_keywords = [ 'coding', 'programming', 'technical', 'skill', 'ability', 'aptitude', 'numerical', 'verbal', 'cognitive', 'reasoning', 'java', 'python', 'sql', 'javascript', 'developer', 'engineer', 'analyst', 'data', 'math', 'logic', 'problem solving', 'critical thinking' ] # Personality/Behavior indicators personality_keywords = [ 'personality', 'behavior', 'motivation', 'leadership', 'competency', 'situational', 'judgment', 'emotional', 'traits', 'values', 'culture fit', 'work style', 'preferences', 'interpersonal' ] k_score = sum(1 for kw in knowledge_keywords if kw in text_lower) p_score = sum(1 for kw in personality_keywords if kw in text_lower) if k_score > p_score: return 'K' elif p_score > k_score: return 'P' else: # Default to K for mixed or unclear return 'K' if 'test' in text_lower or 'skill' in text_lower else 'P' def extract_category(self, text: str) -> str: """Extract category from assessment name""" text_lower = text.lower() if any(kw in text_lower for kw in ['programming', 'coding', 'developer', 'software']): return 'Technical' elif any(kw in text_lower for kw in ['leadership', 'management', 'supervisor']): return 'Leadership' elif any(kw in text_lower for kw in ['numerical', 'math', 'quantitative']): return 'Numerical' elif any(kw in text_lower for kw in ['verbal', 'communication', 'language']): return 'Verbal' elif any(kw in text_lower for kw in ['personality', 'behavior', 'traits']): return 'Personality' else: return 'General' def extract_description(self, element) -> str: """Extract description from nearby elements""" try: # Look for description in parent or sibling elements parent = element.find_parent() if parent: desc = parent.find(['p', 'div'], class_=re.compile(r'desc|summary', re.I)) if desc: return desc.get_text(strip=True)[:500] return element.get_text(strip=True) except: return element.get_text(strip=True) if element else "" def normalize_url(self, url: str) -> str: """Normalize URL to absolute path""" if not url: return self.base_url if url.startswith('http'): return url elif url.startswith('/'): return 'https://www.shl.com' + url else: return 'https://www.shl.com/' + url def scrape_catalog(self) -> pd.DataFrame: """Main method to scrape the catalog""" logger.info("Starting SHL catalog scraping...") # Fetch main page soup = self.fetch_page(self.base_url) if not soup: logger.error("Failed to fetch main page") return self.create_fallback_catalog() # Extract assessments assessments = self.extract_assessment_details(soup) # If scraping fails or returns few results, use fallback if len(assessments) < 10: logger.warning(f"Only found {len(assessments)} assessments, using fallback catalog") return self.create_fallback_catalog() logger.info(f"Found {len(assessments)} assessments") # Convert to DataFrame df = pd.DataFrame(assessments) # Remove duplicates df = df.drop_duplicates(subset=['assessment_name']) logger.info(f"Scraped {len(df)} unique assessments") return df def create_fallback_catalog(self) -> pd.DataFrame: """Create a fallback catalog with common SHL assessments""" logger.info("Creating fallback catalog with common SHL assessments") assessments = [ # Knowledge/Skill Assessments (K) { 'assessment_name': 'Java Programming Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/java-programming', 'category': 'Technical', 'test_type': 'K', 'description': 'Evaluates Java programming skills including object-oriented concepts, data structures, and algorithm implementation.' }, { 'assessment_name': 'Python Coding Test', 'assessment_url': 'https://www.shl.com/solutions/products/python-coding', 'category': 'Technical', 'test_type': 'K', 'description': 'Assesses Python programming abilities, including scripting, data manipulation, and problem-solving skills.' }, { 'assessment_name': 'SQL Database Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/sql-database', 'category': 'Technical', 'test_type': 'K', 'description': 'Measures SQL query writing, database design, and data manipulation capabilities.' }, { 'assessment_name': 'JavaScript Developer Test', 'assessment_url': 'https://www.shl.com/solutions/products/javascript-developer', 'category': 'Technical', 'test_type': 'K', 'description': 'Evaluates JavaScript programming skills, including ES6+, async programming, and DOM manipulation.' }, { 'assessment_name': 'Numerical Reasoning Test', 'assessment_url': 'https://www.shl.com/solutions/products/numerical-reasoning', 'category': 'Numerical', 'test_type': 'K', 'description': 'Assesses ability to work with numerical data, interpret charts, and solve mathematical problems.' }, { 'assessment_name': 'Verbal Reasoning Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/verbal-reasoning', 'category': 'Verbal', 'test_type': 'K', 'description': 'Measures comprehension, critical thinking, and ability to evaluate written information.' }, { 'assessment_name': 'Logical Reasoning Test', 'assessment_url': 'https://www.shl.com/solutions/products/logical-reasoning', 'category': 'General', 'test_type': 'K', 'description': 'Evaluates abstract reasoning, pattern recognition, and logical problem-solving abilities.' }, { 'assessment_name': 'Data Analyst Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/data-analyst', 'category': 'Technical', 'test_type': 'K', 'description': 'Tests data analysis skills, statistical knowledge, and ability to derive insights from data.' }, { 'assessment_name': 'C++ Programming Test', 'assessment_url': 'https://www.shl.com/solutions/products/cpp-programming', 'category': 'Technical', 'test_type': 'K', 'description': 'Assesses C++ programming skills including memory management, OOP, and algorithm implementation.' }, { 'assessment_name': 'Software Development Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/software-development', 'category': 'Technical', 'test_type': 'K', 'description': 'Comprehensive evaluation of software development skills, design patterns, and best practices.' }, # Personality/Behavior Assessments (P) { 'assessment_name': 'Occupational Personality Questionnaire (OPQ)', 'assessment_url': 'https://www.shl.com/solutions/products/opq', 'category': 'Personality', 'test_type': 'P', 'description': 'Comprehensive personality assessment measuring preferred behavioral styles at work.' }, { 'assessment_name': 'Leadership Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/leadership', 'category': 'Leadership', 'test_type': 'P', 'description': 'Evaluates leadership potential, management style, and ability to influence and motivate teams.' }, { 'assessment_name': 'Motivation Questionnaire (MQ)', 'assessment_url': 'https://www.shl.com/solutions/products/motivation-questionnaire', 'category': 'Personality', 'test_type': 'P', 'description': 'Measures work-related motivational factors and drivers of engagement and performance.' }, { 'assessment_name': 'Situational Judgment Test', 'assessment_url': 'https://www.shl.com/solutions/products/situational-judgment', 'category': 'Personality', 'test_type': 'P', 'description': 'Assesses decision-making and problem-solving in realistic work scenarios.' }, { 'assessment_name': 'Team Role Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/team-role', 'category': 'Personality', 'test_type': 'P', 'description': 'Identifies preferred team roles and collaboration styles to optimize team composition.' }, { 'assessment_name': 'Work Values Questionnaire', 'assessment_url': 'https://www.shl.com/solutions/products/work-values', 'category': 'Personality', 'test_type': 'P', 'description': 'Measures alignment between personal values and organizational culture.' }, { 'assessment_name': 'Emotional Intelligence Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/emotional-intelligence', 'category': 'Personality', 'test_type': 'P', 'description': 'Evaluates ability to perceive, understand, and manage emotions in workplace settings.' }, { 'assessment_name': 'Sales Personality Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/sales-personality', 'category': 'Personality', 'test_type': 'P', 'description': 'Assesses personality traits and behaviors critical for sales success.' }, { 'assessment_name': 'Customer Service Aptitude Test', 'assessment_url': 'https://www.shl.com/solutions/products/customer-service', 'category': 'Personality', 'test_type': 'P', 'description': 'Measures interpersonal skills and service orientation for customer-facing roles.' }, { 'assessment_name': 'Management Competency Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/management-competency', 'category': 'Leadership', 'test_type': 'P', 'description': 'Evaluates key management competencies including planning, organizing, and controlling.' }, # Additional mixed assessments { 'assessment_name': 'Graduate Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/graduate-assessment', 'category': 'General', 'test_type': 'K', 'description': 'Comprehensive assessment for graduate recruitment including cognitive and technical skills.' }, { 'assessment_name': 'Critical Thinking Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/critical-thinking', 'category': 'General', 'test_type': 'K', 'description': 'Evaluates analytical thinking, evaluation of arguments, and decision-making abilities.' }, { 'assessment_name': 'Business Acumen Test', 'assessment_url': 'https://www.shl.com/solutions/products/business-acumen', 'category': 'General', 'test_type': 'K', 'description': 'Assesses understanding of business principles, financial literacy, and strategic thinking.' }, { 'assessment_name': 'Project Management Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/project-management', 'category': 'Leadership', 'test_type': 'P', 'description': 'Evaluates project planning, resource management, and stakeholder communication skills.' }, { 'assessment_name': 'Communication Skills Assessment', 'assessment_url': 'https://www.shl.com/solutions/products/communication-skills', 'category': 'Verbal', 'test_type': 'P', 'description': 'Measures written and verbal communication effectiveness in professional contexts.' } ] df = pd.DataFrame(assessments) logger.info(f"Created fallback catalog with {len(df)} assessments") return df def save_to_csv(self, df: pd.DataFrame, filepath: str = 'data/shl_catalog.csv'): """Save catalog to CSV file""" try: df.to_csv(filepath, index=False, encoding='utf-8') logger.info(f"Catalog saved to {filepath}") except Exception as e: logger.error(f"Error saving catalog: {e}") def main(): """Main execution function""" crawler = SHLCrawler() catalog_df = crawler.scrape_catalog() # Save to CSV crawler.save_to_csv(catalog_df) print(f"\nCatalog Summary:") print(f"Total Assessments: {len(catalog_df)}") print(f"\nBy Test Type:") print(catalog_df['test_type'].value_counts()) print(f"\nBy Category:") print(catalog_df['category'].value_counts()) return catalog_df if __name__ == "__main__": main()