Spaces:

akshit7093
/

SHL

Sleeping

App Files Files Community

SHL / scraper.py

joker7094

Initial commit with LFS-tracked index file

655c38a 11 months ago

raw

history blame contribute delete

5.94 kB

	import requests
	from bs4 import BeautifulSoup
	import re
	import logging
	import time
	from random import uniform
	from requests.adapters import HTTPAdapter
	from urllib3.util.retry import Retry

	# Configure the root logger when this module is imported: records at INFO
	# level and above are emitted as "<timestamp> - <level name> - <message>".
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

	# Compiled once at import time instead of on every call.
	# NOTE: the alternation bars must be plain ``|``; an escaped ``\|`` matches a
	# literal pipe character and makes the pattern reject every real URL.
	_URL_PATTERN = re.compile(
	    r'^(?:http|https)://'                                  # scheme
	    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'      # dotted host labels ...
	    r'(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'                 # ... ending in a TLD
	    r'localhost|'                                          # or localhost
	    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'                 # or a dotted quad (octets not range-checked)
	    r'(?::\d+)?'                                           # optional port
	    r'(?:/?|[/?]\S+)$',                                    # optional path / query
	    re.IGNORECASE,
	)

	def is_url(text):
	    """Check whether the input text is an http(s) URL.

	    Accepted hosts are a dotted domain name, ``localhost`` or a dotted
	    quad of 1-3 digit groups, optionally followed by ``:port`` and a
	    path or query string that contains no whitespace.

	    Args:
	        text: Candidate value. Anything that is not a ``str`` is treated
	            as "not a URL" instead of raising ``TypeError``.

	    Returns:
	        ``True`` if ``text`` matches the URL pattern, ``False`` otherwise.
	    """
	    if not isinstance(text, str):
	        return False
	    return bool(_URL_PATTERN.match(text))

	def create_session_with_retry():
	    """Build a ``requests.Session`` whose transports retry failed requests.

	    A single ``HTTPAdapter`` carrying the retry policy is mounted for both
	    the ``http://`` and ``https://`` prefixes.

	    Returns:
	        The configured ``requests.Session``.
	    """
	    retrying_adapter = HTTPAdapter(
	        max_retries=Retry(
	            total=3,  # upper bound on retries per request
	            backoff_factor=1,  # scales the delay between successive retries
	            status_forcelist=[429, 500, 502, 503, 504],  # statuses that trigger a retry
	            allowed_methods=["GET", "POST"],
	        )
	    )

	    http_session = requests.Session()
	    for scheme_prefix in ("http://", "https://"):
	        http_session.mount(scheme_prefix, retrying_adapter)
	    return http_session

	# Browser-like User-Agent strings; attempt N uses entry N modulo the list length.
	_USER_AGENTS = (
	    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
	    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36',
	    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
	)

	# Elements whose class or id names a job description / job details section.
	# The attribute selectors use substring matching (*=): with plain "=" a single
	# attribute would have to equal two different values, which can never match.
	_JOB_CONTAINER_SELECTOR = (
	    '.job-description, .description, .job-details, '
	    '#job-description, #description, #job-details, '
	    '[class*="job"][class*="description"], '
	    '[class*="job"][class*="details"], '
	    '[id*="job"][id*="description"], '
	    '[id*="job"][id*="details"]'
	)

	# Generic page-content wrappers tried when no job-specific container exists.
	_MAIN_CONTENT_SELECTOR = 'main, article, .content, #content'

	def _extract_job_text(html):
	    """Return the most job-description-like text found in an HTML document."""
	    soup = BeautifulSoup(html, 'html.parser')

	    # Script and style bodies are never human-readable content.
	    for tag in soup(["script", "style"]):
	        tag.extract()

	    # Prefer job-specific containers, then generic main-content wrappers.
	    containers = soup.select(_JOB_CONTAINER_SELECTOR) or soup.select(_MAIN_CONTENT_SELECTOR)
	    if containers:
	        text = "\n".join(
	            container.get_text(separator='\n', strip=True) for container in containers
	        )
	    else:
	        # Last resort: the whole body. A fragment without <body> leaves
	        # soup.body as None, so fall back to the document root.
	        root = soup.body if soup.body is not None else soup
	        text = root.get_text(separator='\n', strip=True)

	    # Collapse runs of newlines left behind by nested markup.
	    return re.sub(r'\n+', '\n', text).strip()

	def scrape_job_description(url, max_retries=3):
	    """Scrape job description content from a URL with a retry mechanism.

	    Each attempt fetches ``url`` through a session from
	    ``create_session_with_retry`` (15 second timeout) and extracts text from
	    job-specific containers, then generic content containers, then the
	    page body. Failed attempts are logged and retried after a randomized
	    delay that grows with the attempt number.

	    Args:
	        url: Address of the job posting page.
	        max_retries: Maximum number of attempts made by this function.

	    Returns:
	        The extracted text, or a fixed human-readable fallback message when
	        the page yields no text or every attempt fails. Errors are logged
	        rather than raised.
	    """
	    last_error = None

	    for retry_count in range(max_retries):
	        # Add a random delay between retries to avoid rate limiting.
	        if retry_count > 0:
	            sleep_time = uniform(1, 3) * retry_count
	            logging.info(f"Retry {retry_count}/{max_retries} for {url}, waiting {sleep_time:.2f} seconds")
	            time.sleep(sleep_time)

	        headers = {
	            'User-Agent': _USER_AGENTS[retry_count % len(_USER_AGENTS)],
	            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
	            'Accept-Language': 'en-US,en;q=0.5',
	            'Connection': 'keep-alive',
	            'Upgrade-Insecure-Requests': '1',
	        }

	        try:
	            # The context manager closes the session's pooled connections,
	            # so repeated attempts do not leak sockets.
	            with create_session_with_retry() as session:
	                response = session.get(url, headers=headers, timeout=15)
	                response.raise_for_status()
	                html = response.text
	            job_content = _extract_job_text(html)
	        except requests.exceptions.RequestException as e:
	            logging.error(f"Error scraping {url} (attempt {retry_count+1}/{max_retries}): {str(e)}")
	            last_error = e
	            continue
	        except Exception as e:
	            # Deliberate best-effort boundary: callers always get a string back.
	            logging.error(f"Unexpected error scraping {url} (attempt {retry_count+1}/{max_retries}): {str(e)}")
	            last_error = e
	            continue

	        if not job_content:
	            logging.warning(f"No content extracted from {url}")
	            # Return a fallback message that won't cause issues with embedding
	            return "No job description content could be extracted from the provided URL."

	        logging.info(f"Successfully scraped content from {url}")
	        return job_content

	    # If all retries failed, return a fallback message
	    error_message = f"Failed to scrape job description after {max_retries} attempts: {str(last_error)}"
	    logging.error(error_message)
	    return "Unable to access the job description at this time. Please try again later or provide the job description text directly."

	def extract_job_details(text):
	    """Return the job text with leading and trailing whitespace removed.

	    This is a placeholder for structured extraction (job title, required
	    skills, experience level, ...); for now only the cleaned text is
	    returned.

	    Args:
	        text: Job description text. ``None`` is treated as empty input
	            instead of raising ``AttributeError``.

	    Returns:
	        The stripped text, or ``""`` when ``text`` is ``None``.
	    """
	    if text is None:
	        return ""
	    return text.strip()