Spaces:
Runtime error
Runtime error
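"""
SHL Product Catalog Web Scraper

This module scrapes the SHL Product Catalog landing page to extract
Individual Test Solutions (name, URL, category, test type, description).
Only the single catalog URL is fetched and parsed as static HTML; when the
page cannot be fetched or yields too few entries, a built-in fallback
catalog of common SHL assessments is used instead.
"""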
import logging
import re
import time
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Configure root logging (INFO level, timestamped records) and create the
# module-level logger used by everything below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class SHLCrawler:
    """Scraper for the SHL Product Catalog.

    Fetches the catalog landing page, heuristically extracts assessment
    entries from its links and product-like sections, and falls back to a
    built-in list of common assessments when the page cannot be fetched or
    yields fewer than ten entries.
    """

    # Substrings (matched against lower-cased text) that vote for a
    # Knowledge/Skill ("K") classification.
    _KNOWLEDGE_KEYWORDS = (
        'coding', 'programming', 'technical', 'skill', 'ability', 'aptitude',
        'numerical', 'verbal', 'cognitive', 'reasoning', 'java', 'python',
        'sql', 'javascript', 'developer', 'engineer', 'analyst', 'data',
        'math', 'logic', 'problem solving', 'critical thinking',
    )

    # Substrings that vote for a Personality/Behavior ("P") classification.
    _PERSONALITY_KEYWORDS = (
        'personality', 'behavior', 'motivation', 'leadership', 'competency',
        'situational', 'judgment', 'emotional', 'traits', 'values',
        'culture fit', 'work style', 'preferences', 'interpersonal',
    )

    # Ordered (category, keywords) pairs; the first category with a matching
    # keyword wins, so the order here is significant.
    _CATEGORY_KEYWORDS = (
        ('Technical', ('programming', 'coding', 'developer', 'software')),
        ('Leadership', ('leadership', 'management', 'supervisor')),
        ('Numerical', ('numerical', 'math', 'quantitative')),
        ('Verbal', ('verbal', 'communication', 'language')),
        ('Personality', ('personality', 'behavior', 'traits')),
    )

    # CSS-class patterns, compiled once instead of on every call/iteration.
    _SECTION_CLASS_RE = re.compile(r'product|assessment|test', re.I)
    _SECTION_DESC_RE = re.compile(r'desc|summary|content', re.I)
    _LINK_DESC_RE = re.compile(r'desc|summary', re.I)

    # Fallback catalog rows: (name, URL slug, category, test type, description).
    # Every fallback URL is _FALLBACK_URL_PREFIX + slug.
    _FALLBACK_URL_PREFIX = 'https://www.shl.com/solutions/products/'
    _FALLBACK_ROWS = (
        # Knowledge/Skill Assessments (K)
        ('Java Programming Assessment', 'java-programming', 'Technical', 'K',
         'Evaluates Java programming skills including object-oriented concepts, data structures, and algorithm implementation.'),
        ('Python Coding Test', 'python-coding', 'Technical', 'K',
         'Assesses Python programming abilities, including scripting, data manipulation, and problem-solving skills.'),
        ('SQL Database Assessment', 'sql-database', 'Technical', 'K',
         'Measures SQL query writing, database design, and data manipulation capabilities.'),
        ('JavaScript Developer Test', 'javascript-developer', 'Technical', 'K',
         'Evaluates JavaScript programming skills, including ES6+, async programming, and DOM manipulation.'),
        ('Numerical Reasoning Test', 'numerical-reasoning', 'Numerical', 'K',
         'Assesses ability to work with numerical data, interpret charts, and solve mathematical problems.'),
        ('Verbal Reasoning Assessment', 'verbal-reasoning', 'Verbal', 'K',
         'Measures comprehension, critical thinking, and ability to evaluate written information.'),
        ('Logical Reasoning Test', 'logical-reasoning', 'General', 'K',
         'Evaluates abstract reasoning, pattern recognition, and logical problem-solving abilities.'),
        ('Data Analyst Assessment', 'data-analyst', 'Technical', 'K',
         'Tests data analysis skills, statistical knowledge, and ability to derive insights from data.'),
        ('C++ Programming Test', 'cpp-programming', 'Technical', 'K',
         'Assesses C++ programming skills including memory management, OOP, and algorithm implementation.'),
        ('Software Development Assessment', 'software-development', 'Technical', 'K',
         'Comprehensive evaluation of software development skills, design patterns, and best practices.'),
        # Personality/Behavior Assessments (P)
        ('Occupational Personality Questionnaire (OPQ)', 'opq', 'Personality', 'P',
         'Comprehensive personality assessment measuring preferred behavioral styles at work.'),
        ('Leadership Assessment', 'leadership', 'Leadership', 'P',
         'Evaluates leadership potential, management style, and ability to influence and motivate teams.'),
        ('Motivation Questionnaire (MQ)', 'motivation-questionnaire', 'Personality', 'P',
         'Measures work-related motivational factors and drivers of engagement and performance.'),
        ('Situational Judgment Test', 'situational-judgment', 'Personality', 'P',
         'Assesses decision-making and problem-solving in realistic work scenarios.'),
        ('Team Role Assessment', 'team-role', 'Personality', 'P',
         'Identifies preferred team roles and collaboration styles to optimize team composition.'),
        ('Work Values Questionnaire', 'work-values', 'Personality', 'P',
         'Measures alignment between personal values and organizational culture.'),
        ('Emotional Intelligence Assessment', 'emotional-intelligence', 'Personality', 'P',
         'Evaluates ability to perceive, understand, and manage emotions in workplace settings.'),
        ('Sales Personality Assessment', 'sales-personality', 'Personality', 'P',
         'Assesses personality traits and behaviors critical for sales success.'),
        ('Customer Service Aptitude Test', 'customer-service', 'Personality', 'P',
         'Measures interpersonal skills and service orientation for customer-facing roles.'),
        ('Management Competency Assessment', 'management-competency', 'Leadership', 'P',
         'Evaluates key management competencies including planning, organizing, and controlling.'),
        # Additional mixed assessments
        ('Graduate Assessment', 'graduate-assessment', 'General', 'K',
         'Comprehensive assessment for graduate recruitment including cognitive and technical skills.'),
        ('Critical Thinking Assessment', 'critical-thinking', 'General', 'K',
         'Evaluates analytical thinking, evaluation of arguments, and decision-making abilities.'),
        ('Business Acumen Test', 'business-acumen', 'General', 'K',
         'Assesses understanding of business principles, financial literacy, and strategic thinking.'),
        ('Project Management Assessment', 'project-management', 'Leadership', 'P',
         'Evaluates project planning, resource management, and stakeholder communication skills.'),
        ('Communication Skills Assessment', 'communication-skills', 'Verbal', 'P',
         'Measures written and verbal communication effectiveness in professional contexts.'),
    )

    def __init__(self):
        """Set the catalog URL and the HTTP headers sent with every request."""
        self.base_url = "https://www.shl.com/solutions/products/product-catalog/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Kept for backward compatibility; no method of this class fills it.
        self.assessments = []

    def fetch_page(self, url: str) -> Optional["BeautifulSoup"]:
        """Fetch *url* and parse the response body with the lxml parser.

        Args:
            url: Absolute URL to download.

        Returns:
            The parsed document, or ``None`` when the request or the parsing
            fails for any reason (the error is logged, never raised).
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'lxml')
        except Exception as e:
            # Deliberately broad: callers treat any failure as "no page".
            logger.error("Error fetching %s: %s", url, e)
            return None

    def extract_assessment_details(self, soup: "BeautifulSoup") -> List[Dict]:
        """Extract individual test solutions from a parsed catalog page.

        Two heuristics run in order: first every ``<a href>`` whose text looks
        like an assessment, then every ``div``/``article`` whose CSS class
        mentions product/assessment/test. Exact duplicate records are dropped.

        Args:
            soup: Parsed page, as returned by :meth:`fetch_page`.

        Returns:
            A list of dicts with the keys ``assessment_name``,
            ``assessment_url``, ``category``, ``test_type`` and
            ``description``. If extraction fails part-way, the error is
            logged and the records collected so far are returned.
        """
        assessments: List[Dict] = []
        seen = set()
        try:
            self._collect_from_links(soup, assessments, seen)
            self._collect_from_sections(soup, assessments, seen)
        except Exception as e:
            logger.error("Error extracting assessments: %s", e)
        return assessments

    @staticmethod
    def _add_unique(assessments: List[Dict], seen: set, assessment: Dict) -> None:
        """Append *assessment* unless an identical record was already added."""
        # All values are strings and the key order is fixed, so the value
        # tuple is an exact, hashable stand-in for dict equality.
        key = tuple(assessment.values())
        if key not in seen:
            seen.add(key)
            assessments.append(assessment)

    def _collect_from_links(self, soup, assessments: List[Dict], seen: set) -> None:
        """Collect assessments from anchor tags whose text looks like a test."""
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(strip=True)
            lowered = text.lower()
            # Skip navigation-sized labels and pre-packaged "solution" links.
            if len(text) <= 3 or 'solution' in lowered:
                continue
            # Require an explicit test/assessment mention. Previously the
            # unparenthesised and/or chain let every non-"solution" link
            # through, navigation links included.
            if 'test' not in lowered and 'assessment' not in lowered:
                continue
            self._add_unique(assessments, seen, {
                'assessment_name': text,
                'assessment_url': self.normalize_url(href),
                'category': self.extract_category(text),
                'test_type': self.determine_test_type(text),
                'description': self.extract_description(link),
            })

    def _collect_from_sections(self, soup, assessments: List[Dict], seen: set) -> None:
        """Collect assessments from product/assessment/test-classed sections."""
        for section in soup.find_all(['div', 'article'], class_=self._SECTION_CLASS_RE):
            title_elem = section.find(['h2', 'h3', 'h4', 'a'])
            if title_elem is None:
                continue
            title = title_elem.get_text(strip=True)
            # Titles of three characters or fewer are treated as noise.
            if len(title) <= 3:
                continue

            link_elem = section.find('a', href=True)
            url = link_elem.get('href', '') if link_elem is not None else ''

            desc_elem = section.find(['p', 'div'], class_=self._SECTION_DESC_RE)
            description = desc_elem.get_text(strip=True) if desc_elem is not None else title

            self._add_unique(assessments, seen, {
                'assessment_name': title,
                'assessment_url': self.normalize_url(url),
                'category': self.extract_category(title),
                'test_type': self.determine_test_type(title + ' ' + description),
                # An empty description element falls back to the title.
                'description': description[:500] if description else title,
            })

    def determine_test_type(self, text: str) -> str:
        """Classify *text* as Knowledge ('K') or Personality ('P').

        Counts how many knowledge and personality keywords occur as
        substrings of the lower-cased text and returns the side with more
        hits. On a tie the result is 'K' when the text mentions "test" or
        "skill", otherwise 'P'. The result is never empty.
        """
        text_lower = text.lower()
        k_score = sum(1 for kw in self._KNOWLEDGE_KEYWORDS if kw in text_lower)
        p_score = sum(1 for kw in self._PERSONALITY_KEYWORDS if kw in text_lower)
        if k_score > p_score:
            return 'K'
        if p_score > k_score:
            return 'P'
        # Tie (including no hits at all): default to K only for explicit
        # test/skill wording.
        return 'K' if 'test' in text_lower or 'skill' in text_lower else 'P'

    def extract_category(self, text: str) -> str:
        """Map an assessment name to a category by keyword lookup.

        Returns the first of 'Technical', 'Leadership', 'Numerical', 'Verbal'
        or 'Personality' whose keywords occur in the lower-cased text, or
        'General' when none match.
        """
        text_lower = text.lower()
        for category, keywords in self._CATEGORY_KEYWORDS:
            if any(kw in text_lower for kw in keywords):
                return category
        return 'General'

    def extract_description(self, element) -> str:
        """Return a description for *element* from its immediate surroundings.

        Looks in the element's parent for a ``p``/``div`` whose CSS class
        matches desc/summary and returns its text, capped at 500 characters.
        Otherwise returns the element's own text. ``None`` yields ``""``.
        """
        if element is None:
            return ""
        try:
            parent = element.find_parent()
            if parent is not None:
                desc = parent.find(['p', 'div'], class_=self._LINK_DESC_RE)
                if desc is not None:
                    return desc.get_text(strip=True)[:500]
        except Exception:
            # Best effort: a failed lookup falls through to the element's own
            # text. `except Exception` (not a bare except) so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            pass
        return element.get_text(strip=True)

    def normalize_url(self, url: str) -> str:
        """Resolve *url* to an absolute URL relative to the catalog page.

        An empty value yields the catalog URL itself. Absolute URLs are
        returned unchanged, root-relative paths resolve against the site
        root, protocol-relative ``//host/path`` URLs inherit the ``https``
        scheme, and page-relative paths resolve against the catalog page.
        """
        if not url:
            return self.base_url
        return urljoin(self.base_url, url)

    def scrape_catalog(self) -> pd.DataFrame:
        """Scrape the catalog and return it as a DataFrame.

        Returns the fallback catalog when the page cannot be fetched or when
        fewer than ten records are extracted; otherwise the extracted records
        with duplicate ``assessment_name`` rows removed.
        """
        logger.info("Starting SHL catalog scraping...")

        soup = self.fetch_page(self.base_url)
        if soup is None:
            logger.error("Failed to fetch main page")
            return self.create_fallback_catalog()

        assessments = self.extract_assessment_details(soup)
        # Too few hits most likely means the page layout was not recognised.
        if len(assessments) < 10:
            logger.warning(
                "Only found %d assessments, using fallback catalog", len(assessments)
            )
            return self.create_fallback_catalog()
        logger.info("Found %d assessments", len(assessments))

        df = pd.DataFrame(assessments).drop_duplicates(subset=['assessment_name'])
        logger.info("Scraped %d unique assessments", len(df))
        return df

    def create_fallback_catalog(self) -> pd.DataFrame:
        """Build the built-in catalog of common SHL assessments.

        Returns:
            A DataFrame with the columns ``assessment_name``,
            ``assessment_url``, ``category``, ``test_type`` and
            ``description``, one row per built-in assessment.
        """
        logger.info("Creating fallback catalog with common SHL assessments")
        assessments = [
            {
                'assessment_name': name,
                'assessment_url': self._FALLBACK_URL_PREFIX + slug,
                'category': category,
                'test_type': test_type,
                'description': description,
            }
            for name, slug, category, test_type, description in self._FALLBACK_ROWS
        ]
        df = pd.DataFrame(assessments)
        logger.info("Created fallback catalog with %d assessments", len(df))
        return df

    def save_to_csv(self, df: pd.DataFrame, filepath: str = 'data/shl_catalog.csv'):
        """Write *df* to *filepath* as UTF-8 CSV without the index column.

        The parent directory is created when missing, so the default
        ``data/`` location works on a fresh checkout. Failures are logged,
        not raised.
        """
        try:
            if isinstance(filepath, (str, Path)):
                Path(filepath).parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(filepath, index=False, encoding='utf-8')
            logger.info("Catalog saved to %s", filepath)
        except Exception as e:
            logger.error("Error saving catalog: %s", e)
def main():
    """Scrape the catalog, save it to CSV, and print a short summary.

    Returns:
        The catalog DataFrame produced by ``SHLCrawler.scrape_catalog``.
    """
    scraper = SHLCrawler()
    catalog = scraper.scrape_catalog()

    # Persist before reporting so the summary reflects what was written.
    scraper.save_to_csv(catalog)

    print("\nCatalog Summary:")
    print(f"Total Assessments: {len(catalog)}")
    # One frequency table per grouping column, each under its own heading.
    for heading, column in (
        ("\nBy Test Type:", 'test_type'),
        ("\nBy Category:", 'category'),
    ):
        print(heading)
        print(catalog[column].value_counts())
    return catalog


# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()