SHL / scraper.py
joker7094's picture
Initial commit with LFS-tracked index file
655c38a
import requests
from bs4 import BeautifulSoup
import re
import logging
import time
from random import uniform
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Root logger setup: emit INFO and above, each record formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Compiled once at import time. It is applied with ``fullmatch``, so the
# pattern needs no ``^``/``$`` anchors and cannot match a string that merely
# ends with a newline after the URL.
_URL_PATTERN = re.compile(
    r'(?:http|https)://'  # scheme
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'  # dotted domain labels
    r'(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # final (top-level) label
    r'localhost|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # dotted quad; octet range is not validated
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)',  # optional path/query containing no whitespace
    re.IGNORECASE,
)


def is_url(text):
    """Check whether the input text is, in its entirety, an http(s) URL.

    The host may be a dotted domain name, ``localhost`` or a dotted-quad
    address, optionally followed by a port and a whitespace-free path or
    query string. Matching is case-insensitive.

    Args:
        text: Candidate value. Anything that is not a ``str`` is rejected.

    Returns:
        True if the whole of ``text`` matches the URL pattern, else False.
        Surrounding whitespace (including a trailing newline) makes the
        result False.
    """
    if not isinstance(text, str):
        return False
    return _URL_PATTERN.fullmatch(text) is not None
def create_session_with_retry():
    """Build a ``requests.Session`` that retries failed requests.

    One ``HTTPAdapter`` carrying the retry policy is mounted for both the
    ``http://`` and ``https://`` prefixes.

    Returns:
        The configured ``requests.Session``.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,  # at most three retries per request
            backoff_factor=1,  # scales the delay between retries
            status_forcelist=[429, 500, 502, 503, 504],  # statuses that trigger a retry
            allowed_methods=["GET", "POST"],
        )
    )
    http_session = requests.Session()
    for scheme_prefix in ("http://", "https://"):
        http_session.mount(scheme_prefix, retrying_adapter)
    return http_session
# Browser-like User-Agent strings; attempt N sends entry N modulo the length.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
)

# CSS selectors for elements that commonly wrap a job description.
_JOB_CONTAINER_SELECTOR = (
    '.job-description, .description, .job-details, '
    '#job-description, #description, #job-details, '
    '[class*="job"][class*="description"], '
    '[class*="job"][class*="details"], '
    '[id*="job"][id*="description"], '
    '[id*="job"][id*="details"]'
)

# Generic page-content selectors, tried when no job container yields text.
_MAIN_CONTENT_SELECTOR = 'main, article, .content, #content'


def _outermost_elements(elements):
    """Drop every element that is nested inside another element of the list."""
    matched_ids = {id(element) for element in elements}
    # An element whose ancestor also matched is already covered by that
    # ancestor's text; keeping both would emit the same text twice.
    return [
        element
        for element in elements
        if not any(id(ancestor) in matched_ids for ancestor in element.parents)
    ]


def _clean_text(parts):
    """Join text fragments with newlines and collapse runs of newlines."""
    return re.sub(r'\n+', '\n', '\n'.join(parts)).strip()


def _extract_job_text(soup):
    """Return the most specific non-empty text found in the parsed page, or ''."""
    # Script and style bodies are not human-readable content.
    for tag in soup(["script", "style"]):
        tag.extract()

    # Try job-specific containers first, then generic content containers;
    # move on to the next strategy whenever one produces no text at all.
    for selector in (_JOB_CONTAINER_SELECTOR, _MAIN_CONTENT_SELECTOR):
        text = _clean_text(
            element.get_text(separator='\n', strip=True)
            for element in _outermost_elements(soup.select(selector))
        )
        if text:
            return text

    # Last resort: the whole body, or the whole document when the markup has
    # no <body> element (soup.body is None in that case).
    root = soup.body if soup.body is not None else soup
    return _clean_text([root.get_text(separator='\n', strip=True)])


def scrape_job_description(url, max_retries=3):
    """Scrape job description text from a URL, retrying on failure.

    Each attempt fetches the page with a fresh retry-enabled session (closed
    again once the response has been read), a rotating User-Agent header and
    a 15 second timeout, then extracts text from job-description containers,
    falling back to generic content containers and finally the page body.
    Attempts after the first are preceded by a random delay that grows with
    the attempt number.

    Args:
        url: Address of the page to fetch.
        max_retries: Maximum number of attempts. With a value below 1 no
            request is made and the failure message is returned.

    Returns:
        The extracted text with runs of newlines collapsed. If the page was
        fetched but contained no text, or if every attempt failed, a fixed
        human-readable fallback sentence is returned instead; exceptions
        derived from ``Exception`` are logged rather than raised.
    """
    last_error = None

    for attempt in range(max_retries):
        # Back off before every retry to avoid tripping rate limits.
        if attempt > 0:
            sleep_time = uniform(1, 3) * attempt
            logging.info(
                "Retry %s/%s for %s, waiting %.2f seconds",
                attempt, max_retries, url, sleep_time,
            )
            time.sleep(sleep_time)

        headers = {
            'User-Agent': _USER_AGENTS[attempt % len(_USER_AGENTS)],
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

        try:
            # The context manager closes the session's pooled connections
            # even when the request raises.
            with create_session_with_retry() as session:
                response = session.get(url, headers=headers, timeout=15)
                response.raise_for_status()
                html = response.text
            job_content = _extract_job_text(BeautifulSoup(html, 'html.parser'))
        except requests.exceptions.RequestException as e:
            logging.error(
                "Error scraping %s (attempt %s/%s): %s",
                url, attempt + 1, max_retries, e,
            )
            last_error = e
            continue
        except Exception as e:
            # Deliberately broad: this function reports failure through its
            # return value instead of raising.
            logging.error(
                "Unexpected error scraping %s (attempt %s/%s): %s",
                url, attempt + 1, max_retries, e,
            )
            last_error = e
            continue

        if not job_content:
            logging.warning("No content extracted from %s", url)
            # Return a fallback message that won't cause issues with embedding
            return "No job description content could be extracted from the provided URL."

        logging.info("Successfully scraped content from %s", url)
        return job_content

    # Every attempt failed (or none was allowed): log the cause and return a
    # fallback message.
    logging.error(
        "Failed to scrape job description after %s attempts: %s",
        max_retries, last_error,
    )
    return "Unable to access the job description at this time. Please try again later or provide the job description text directly."
def extract_job_details(text):
    """Return the job details derived from scraped text.

    Currently this is only the input with leading and trailing whitespace
    removed; it is the place to add extraction of specific fields (job
    title, required skills, experience level, ...) later.
    """
    cleaned = text.strip()
    return cleaned