Spaces:

Harsh-1132
/

SHL

Runtime error

App Files Files Community

SHL / src /crawler.py

Harsh-1132

Clean deployment

d18c374 3 months ago

raw

history blame contribute delete

19.9 kB

	"""
	SHL Product Catalog Web Scraper

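	This module scrapes the SHL Product Catalog to extract Individual Test Solutions.
	It fetches the catalog landing page, extracts assessment details heuristically,
	and falls back to a built-in catalog when too few results are found.
	Pagination and dynamically rendered content are not handled.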
	"""

	import logging
	import re
	import time
	from pathlib import Path
	from typing import Dict, List, Optional
	from urllib.parse import urljoin

	import pandas as pd
	import requests
	from bs4 import BeautifulSoup

	# Set up logging: configure the root logger at import time (INFO level,
	# "timestamp - level - message" format) and create the module-level logger
	# that the functions and methods in this file write to.
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)


	class SHLCrawler:
	    """Scraper for the SHL Product Catalog.

	    The crawler downloads the catalog landing page, heuristically extracts
	    assessment records from its links and product sections, and returns them
	    as a DataFrame. When the page cannot be fetched or yields too few unique
	    assessments, a built-in fallback catalog is returned instead.

	    Every record has the keys ``assessment_name``, ``assessment_url``,
	    ``category``, ``test_type`` ('K' or 'P') and ``description``.
	    """

	    # CSS-class filters. '|' must be unescaped: in Python's ``re`` syntax
	    # ``\|`` is a literal pipe character, which never occurs in class names.
	    _SECTION_CLASS_RE = re.compile(r'product|assessment|test', re.I)
	    _SECTION_DESC_CLASS_RE = re.compile(r'desc|summary|content', re.I)
	    _LINK_DESC_CLASS_RE = re.compile(r'desc|summary', re.I)

	    # Substrings that count towards the Knowledge/Skill ('K') score.
	    _KNOWLEDGE_KEYWORDS = (
	        'coding', 'programming', 'technical', 'skill', 'ability', 'aptitude',
	        'numerical', 'verbal', 'cognitive', 'reasoning', 'java', 'python',
	        'sql', 'javascript', 'developer', 'engineer', 'analyst', 'data',
	        'math', 'logic', 'problem solving', 'critical thinking',
	    )

	    # Substrings that count towards the Personality/Behavior ('P') score.
	    _PERSONALITY_KEYWORDS = (
	        'personality', 'behavior', 'motivation', 'leadership', 'competency',
	        'situational', 'judgment', 'emotional', 'traits', 'values',
	        'culture fit', 'work style', 'preferences', 'interpersonal',
	    )

	    # Ordered (category, keywords) rules; the first rule that matches wins.
	    _CATEGORY_RULES = (
	        ('Technical', ('programming', 'coding', 'developer', 'software')),
	        ('Leadership', ('leadership', 'management', 'supervisor')),
	        ('Numerical', ('numerical', 'math', 'quantitative')),
	        ('Verbal', ('verbal', 'communication', 'language')),
	        ('Personality', ('personality', 'behavior', 'traits')),
	    )

	    def __init__(self):
	        """Set the catalog URL and the HTTP headers sent with every request."""
	        self.base_url = "https://www.shl.com/solutions/products/product-catalog/"
	        self.headers = {
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	        }
	        # Not written to by any method of this class; kept because it is a
	        # public attribute that external code may read.
	        self.assessments = []

	    def fetch_page(self, url: str) -> "Optional[BeautifulSoup]":
	        """Fetch and parse a webpage.

	        Args:
	            url: Address of the page to download.

	        Returns:
	            The parsed document, or ``None`` when the request or the parsing
	            fails (the error is logged, not raised).
	        """
	        try:
	            response = requests.get(url, headers=self.headers, timeout=30)
	            response.raise_for_status()
	            return BeautifulSoup(response.content, 'lxml')
	        except Exception as e:
	            # Deliberately broad: any failure here must lead to the fallback
	            # catalog in scrape_catalog() rather than to a crash.
	            logger.error(f"Error fetching {url}: {e}")
	            return None

	    def extract_assessment_details(self, soup: "BeautifulSoup") -> List[Dict]:
	        """Extract individual test solutions from the page.

	        Two heuristics are applied in order: every ``<a href>`` whose text
	        passes the candidate filter, then every ``div``/``article`` whose CSS
	        class mentions product, assessment or test.

	        Args:
	            soup: Parsed catalog page.

	        Returns:
	            Assessment records without exact duplicates, in discovery order.
	            If extraction raises part-way, the error is logged and the
	            records collected so far are returned.
	        """
	        assessments: List[Dict] = []
	        seen = set()

	        def add_unique(assessment: Dict) -> None:
	            # All values are strings and the key order is fixed, so the item
	            # tuple is a hashable stand-in for dict equality.
	            key = tuple(assessment.items())
	            if key not in seen:
	                seen.add(key)
	                assessments.append(assessment)

	        try:
	            self._collect_from_links(soup, add_unique)
	            self._collect_from_sections(soup, add_unique)
	        except Exception as e:
	            logger.error(f"Error extracting assessments: {e}")

	        return assessments

	    @staticmethod
	    def _is_candidate_link(text: str) -> bool:
	        """Return True when link text may name an individual assessment.

	        The text must be longer than three characters and must either not
	        mention "solution" or explicitly mention "test" or "assessment".
	        """
	        if not text or len(text) <= 3:
	            return False
	        lowered = text.lower()
	        return (
	            'solution' not in lowered
	            or 'test' in lowered
	            or 'assessment' in lowered
	        )

	    def _collect_from_links(self, soup, add) -> None:
	        """Build a record from every anchor whose text passes the filter."""
	        for link in soup.find_all('a', href=True):
	            text = link.get_text(strip=True)
	            if not self._is_candidate_link(text):
	                continue

	            test_type = self.determine_test_type(text)
	            if not test_type:
	                continue

	            add({
	                'assessment_name': text,
	                'assessment_url': self.normalize_url(link.get('href', '')),
	                'category': self.extract_category(text),
	                'test_type': test_type,
	                'description': self.extract_description(link),
	            })

	    def _collect_from_sections(self, soup, add) -> None:
	        """Build a record from every product/assessment/test section."""
	        sections = soup.find_all(['div', 'article'], class_=self._SECTION_CLASS_RE)
	        for section in sections:
	            title_elem = section.find(['h2', 'h3', 'h4', 'a'])
	            if title_elem is None:
	                continue
	            title = title_elem.get_text(strip=True)

	            link_elem = section.find('a', href=True)
	            url = link_elem.get('href', '') if link_elem is not None else ''

	            desc_elem = section.find(['p', 'div'], class_=self._SECTION_DESC_CLASS_RE)
	            description = desc_elem.get_text(strip=True) if desc_elem is not None else title

	            test_type = self.determine_test_type(title + ' ' + description)

	            # Titles of three characters or fewer (including empty ones) are
	            # treated as noise.
	            if not test_type or len(title) <= 3:
	                continue

	            add({
	                'assessment_name': title,
	                'assessment_url': self.normalize_url(url),
	                'category': self.extract_category(title),
	                'test_type': test_type,
	                'description': description[:500] if description else title,
	            })

	    def determine_test_type(self, text: str) -> str:
	        """Classify text as Knowledge ('K') or Personality ('P').

	        Counts how many knowledge and personality keywords occur as
	        substrings of the lower-cased text; the larger count wins. On a tie
	        (including no matches at all) the result is 'K' when the text
	        contains "test" or "skill", otherwise 'P'.

	        Args:
	            text: Assessment name, optionally followed by its description.

	        Returns:
	            'K' or 'P'; never an empty value.
	        """
	        text_lower = text.lower()

	        k_score = sum(1 for kw in self._KNOWLEDGE_KEYWORDS if kw in text_lower)
	        p_score = sum(1 for kw in self._PERSONALITY_KEYWORDS if kw in text_lower)

	        if k_score > p_score:
	            return 'K'
	        if p_score > k_score:
	            return 'P'
	        # Tie-break for mixed or unclear text.
	        return 'K' if 'test' in text_lower or 'skill' in text_lower else 'P'

	    def extract_category(self, text: str) -> str:
	        """Extract a category from an assessment name.

	        Args:
	            text: Assessment name.

	        Returns:
	            The first category whose keywords occur in the lower-cased text
	            (Technical, Leadership, Numerical, Verbal, Personality, in that
	            order), or 'General' when none match.
	        """
	        text_lower = text.lower()
	        for category, keywords in self._CATEGORY_RULES:
	            if any(kw in text_lower for kw in keywords):
	                return category
	        return 'General'

	    def extract_description(self, element) -> str:
	        """Extract a description for a link from nearby elements.

	        Looks in the element's parent for a ``p``/``div`` whose CSS class
	        mentions "desc" or "summary" and returns its text, truncated to 500
	        characters. Falls back to the element's own text.

	        Args:
	            element: The anchor tag being described; may be ``None``.

	        Returns:
	            The description text, or "" when ``element`` is falsy.
	        """
	        try:
	            parent = element.find_parent()
	            if parent is not None:
	                desc = parent.find(['p', 'div'], class_=self._LINK_DESC_CLASS_RE)
	                if desc is not None:
	                    return desc.get_text(strip=True)[:500]
	            return element.get_text(strip=True)
	        except Exception:
	            return element.get_text(strip=True) if element else ""

	    def normalize_url(self, url: str) -> str:
	        """Normalize a URL to an absolute URL.

	        Args:
	            url: An href taken from the catalog page; may be empty.

	        Returns:
	            ``base_url`` for an empty href; otherwise the href resolved
	            against ``base_url``. Absolute URLs are returned unchanged,
	            '/path' is resolved against the site root, '//host/path'
	            inherits the https scheme, and relative paths are resolved
	            against the catalog page.
	        """
	        if not url:
	            return self.base_url
	        return urljoin(self.base_url, url)

	    def scrape_catalog(self, min_results: int = 10) -> pd.DataFrame:
	        """Main method to scrape the catalog.

	        Args:
	            min_results: Minimum number of uniquely named assessments the
	                scrape must yield; below this the fallback catalog is used.

	        Returns:
	            One row per unique assessment name, or the fallback catalog when
	            the page cannot be fetched or too few assessments are found.
	        """
	        logger.info("Starting SHL catalog scraping...")

	        soup = self.fetch_page(self.base_url)
	        if soup is None:
	            logger.error("Failed to fetch main page")
	            return self.create_fallback_catalog()

	        assessments = self.extract_assessment_details(soup)

	        # Apply the threshold to unique names, because rows are de-duplicated
	        # by name below and the result could otherwise fall under the minimum.
	        unique_names = {a['assessment_name'] for a in assessments}
	        if len(unique_names) < min_results:
	            logger.warning(f"Only found {len(unique_names)} assessments, using fallback catalog")
	            return self.create_fallback_catalog()

	        logger.info(f"Found {len(assessments)} assessments")

	        df = pd.DataFrame(assessments)
	        df = df.drop_duplicates(subset=['assessment_name'])

	        logger.info(f"Scraped {len(df)} unique assessments")

	        return df

	    def create_fallback_catalog(self) -> pd.DataFrame:
	        """Create a fallback catalog with common SHL assessments.

	        Returns:
	            A DataFrame of hard-coded assessment records with the same
	            columns as the scraped catalog.
	        """
	        logger.info("Creating fallback catalog with common SHL assessments")

	        assessments = [
	            # Knowledge/Skill Assessments (K)
	            {
	                'assessment_name': 'Java Programming Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/java-programming',
	                'category': 'Technical',
	                'test_type': 'K',
	                'description': 'Evaluates Java programming skills including object-oriented concepts, data structures, and algorithm implementation.'
	            },
	            {
	                'assessment_name': 'Python Coding Test',
	                'assessment_url': 'https://www.shl.com/solutions/products/python-coding',
	                'category': 'Technical',
	                'test_type': 'K',
	                'description': 'Assesses Python programming abilities, including scripting, data manipulation, and problem-solving skills.'
	            },
	            {
	                'assessment_name': 'SQL Database Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/sql-database',
	                'category': 'Technical',
	                'test_type': 'K',
	                'description': 'Measures SQL query writing, database design, and data manipulation capabilities.'
	            },
	            {
	                'assessment_name': 'JavaScript Developer Test',
	                'assessment_url': 'https://www.shl.com/solutions/products/javascript-developer',
	                'category': 'Technical',
	                'test_type': 'K',
	                'description': 'Evaluates JavaScript programming skills, including ES6+, async programming, and DOM manipulation.'
	            },
	            {
	                'assessment_name': 'Numerical Reasoning Test',
	                'assessment_url': 'https://www.shl.com/solutions/products/numerical-reasoning',
	                'category': 'Numerical',
	                'test_type': 'K',
	                'description': 'Assesses ability to work with numerical data, interpret charts, and solve mathematical problems.'
	            },
	            {
	                'assessment_name': 'Verbal Reasoning Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/verbal-reasoning',
	                'category': 'Verbal',
	                'test_type': 'K',
	                'description': 'Measures comprehension, critical thinking, and ability to evaluate written information.'
	            },
	            {
	                'assessment_name': 'Logical Reasoning Test',
	                'assessment_url': 'https://www.shl.com/solutions/products/logical-reasoning',
	                'category': 'General',
	                'test_type': 'K',
	                'description': 'Evaluates abstract reasoning, pattern recognition, and logical problem-solving abilities.'
	            },
	            {
	                'assessment_name': 'Data Analyst Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/data-analyst',
	                'category': 'Technical',
	                'test_type': 'K',
	                'description': 'Tests data analysis skills, statistical knowledge, and ability to derive insights from data.'
	            },
	            {
	                'assessment_name': 'C++ Programming Test',
	                'assessment_url': 'https://www.shl.com/solutions/products/cpp-programming',
	                'category': 'Technical',
	                'test_type': 'K',
	                'description': 'Assesses C++ programming skills including memory management, OOP, and algorithm implementation.'
	            },
	            {
	                'assessment_name': 'Software Development Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/software-development',
	                'category': 'Technical',
	                'test_type': 'K',
	                'description': 'Comprehensive evaluation of software development skills, design patterns, and best practices.'
	            },

	            # Personality/Behavior Assessments (P)
	            {
	                'assessment_name': 'Occupational Personality Questionnaire (OPQ)',
	                'assessment_url': 'https://www.shl.com/solutions/products/opq',
	                'category': 'Personality',
	                'test_type': 'P',
	                'description': 'Comprehensive personality assessment measuring preferred behavioral styles at work.'
	            },
	            {
	                'assessment_name': 'Leadership Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/leadership',
	                'category': 'Leadership',
	                'test_type': 'P',
	                'description': 'Evaluates leadership potential, management style, and ability to influence and motivate teams.'
	            },
	            {
	                'assessment_name': 'Motivation Questionnaire (MQ)',
	                'assessment_url': 'https://www.shl.com/solutions/products/motivation-questionnaire',
	                'category': 'Personality',
	                'test_type': 'P',
	                'description': 'Measures work-related motivational factors and drivers of engagement and performance.'
	            },
	            {
	                'assessment_name': 'Situational Judgment Test',
	                'assessment_url': 'https://www.shl.com/solutions/products/situational-judgment',
	                'category': 'Personality',
	                'test_type': 'P',
	                'description': 'Assesses decision-making and problem-solving in realistic work scenarios.'
	            },
	            {
	                'assessment_name': 'Team Role Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/team-role',
	                'category': 'Personality',
	                'test_type': 'P',
	                'description': 'Identifies preferred team roles and collaboration styles to optimize team composition.'
	            },
	            {
	                'assessment_name': 'Work Values Questionnaire',
	                'assessment_url': 'https://www.shl.com/solutions/products/work-values',
	                'category': 'Personality',
	                'test_type': 'P',
	                'description': 'Measures alignment between personal values and organizational culture.'
	            },
	            {
	                'assessment_name': 'Emotional Intelligence Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/emotional-intelligence',
	                'category': 'Personality',
	                'test_type': 'P',
	                'description': 'Evaluates ability to perceive, understand, and manage emotions in workplace settings.'
	            },
	            {
	                'assessment_name': 'Sales Personality Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/sales-personality',
	                'category': 'Personality',
	                'test_type': 'P',
	                'description': 'Assesses personality traits and behaviors critical for sales success.'
	            },
	            {
	                'assessment_name': 'Customer Service Aptitude Test',
	                'assessment_url': 'https://www.shl.com/solutions/products/customer-service',
	                'category': 'Personality',
	                'test_type': 'P',
	                'description': 'Measures interpersonal skills and service orientation for customer-facing roles.'
	            },
	            {
	                'assessment_name': 'Management Competency Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/management-competency',
	                'category': 'Leadership',
	                'test_type': 'P',
	                'description': 'Evaluates key management competencies including planning, organizing, and controlling.'
	            },

	            # Additional mixed assessments
	            {
	                'assessment_name': 'Graduate Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/graduate-assessment',
	                'category': 'General',
	                'test_type': 'K',
	                'description': 'Comprehensive assessment for graduate recruitment including cognitive and technical skills.'
	            },
	            {
	                'assessment_name': 'Critical Thinking Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/critical-thinking',
	                'category': 'General',
	                'test_type': 'K',
	                'description': 'Evaluates analytical thinking, evaluation of arguments, and decision-making abilities.'
	            },
	            {
	                'assessment_name': 'Business Acumen Test',
	                'assessment_url': 'https://www.shl.com/solutions/products/business-acumen',
	                'category': 'General',
	                'test_type': 'K',
	                'description': 'Assesses understanding of business principles, financial literacy, and strategic thinking.'
	            },
	            {
	                'assessment_name': 'Project Management Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/project-management',
	                'category': 'Leadership',
	                'test_type': 'P',
	                'description': 'Evaluates project planning, resource management, and stakeholder communication skills.'
	            },
	            {
	                'assessment_name': 'Communication Skills Assessment',
	                'assessment_url': 'https://www.shl.com/solutions/products/communication-skills',
	                'category': 'Verbal',
	                'test_type': 'P',
	                'description': 'Measures written and verbal communication effectiveness in professional contexts.'
	            }
	        ]

	        df = pd.DataFrame(assessments)
	        logger.info(f"Created fallback catalog with {len(df)} assessments")
	        return df

	    def save_to_csv(self, df: pd.DataFrame, filepath: str = 'data/shl_catalog.csv'):
	        """Save the catalog to a CSV file.

	        The parent directory is created if it does not exist. Failures are
	        logged rather than raised.

	        Args:
	            df: Catalog to write.
	            filepath: Destination path of the CSV file.
	        """
	        try:
	            # Without this, the default 'data/' target fails on a fresh checkout.
	            Path(filepath).parent.mkdir(parents=True, exist_ok=True)
	            df.to_csv(filepath, index=False, encoding='utf-8')
	            logger.info(f"Catalog saved to {filepath}")
	        except Exception as e:
	            logger.error(f"Error saving catalog: {e}")


	def main():
	    """Scrape the catalog, write it to CSV and print a short summary.

	    Returns:
	        The catalog DataFrame produced by ``SHLCrawler.scrape_catalog``.
	    """
	    scraper = SHLCrawler()
	    catalog = scraper.scrape_catalog()

	    # Persist to the crawler's default CSV location.
	    scraper.save_to_csv(catalog)

	    print("\nCatalog Summary:")
	    print(f"Total Assessments: {len(catalog)}")

	    # One frequency table per summary column.
	    breakdowns = (
	        ("\nBy Test Type:", 'test_type'),
	        ("\nBy Category:", 'category'),
	    )
	    for heading, column in breakdowns:
	        print(heading)
	        print(catalog[column].value_counts())

	    return catalog


	# Run the scraper only when this file is executed as a script.
	if __name__ == "__main__":
	    main()