import requests
from bs4 import BeautifulSoup
import re
import logging
import time
from random import uniform
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def is_url(text):
    """Check if the input text is a URL."""
    url_pattern = re.compile(
        r'^(?:http|https)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'
        r'(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
        r'localhost|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return bool(url_pattern.match(text))
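
# Illustrative checks for is_url (the inputs are made-up examples, not real postings):
#   is_url("https://example.com/jobs/123")  -> True   (scheme + domain + path)
#   is_url("http://localhost:8000/job")     -> True   (localhost and port are accepted)
#   is_url("Senior Python Developer role")  -> False  (plain text, no scheme)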

def create_session_with_retry():
    """Create a requests session with retry capabilities."""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,  # Maximum number of retries
        backoff_factor=1,  # Time factor between retries
        status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
        allowed_methods=["GET", "POST"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
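
# Note on the Retry schedule above: with backoff_factor=1, urllib3 sleeps an
# exponentially growing interval between retried requests (on the order of
# 1s, 2s, 4s), capped at Retry.BACKOFF_MAX; the exact formula differs slightly
# between urllib3 1.x and 2.x. These transport-level retries stack with the
# manual retry loop in scrape_job_description below.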

def scrape_job_description(url, max_retries=3):
    """Scrape job description content from a URL with retry mechanism."""
    retry_count = 0
    last_error = None

    # Different user agents to rotate
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
    ]

    while retry_count < max_retries:
        try:
            # Add random delay between retries to avoid rate limiting
            if retry_count > 0:
                sleep_time = uniform(1, 3) * retry_count
                logging.info(f"Retry {retry_count}/{max_retries} for {url}, waiting {sleep_time:.2f} seconds")
                time.sleep(sleep_time)

            # Rotate user agents
            headers = {
                'User-Agent': user_agents[retry_count % len(user_agents)],
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1'
            }

            # Use session with retry capabilities
            session = create_session_with_retry()
            response = session.get(url, headers=headers, timeout=15)  # Increased timeout
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.extract()

            # Extract text from common job description containers
            job_content = ""

            # Look for common job description containers
            potential_containers = soup.select(
                '.job-description, .description, .job-details, '
                '#job-description, #description, #job-details, '
                '[class*="job"][class*="description"], '
                '[class*="job"][class*="details"], '
                '[id*="job"][id*="description"], '
                '[id*="job"][id*="details"]'
            )

            if potential_containers:
                for container in potential_containers:
                    job_content += container.get_text(separator='\n', strip=True) + "\n\n"
            else:
                # If no specific containers found, get the main content
                main_content = soup.select('main, article, .content, #content')
                if main_content:
                    for content in main_content:
                        job_content += content.get_text(separator='\n', strip=True) + "\n\n"
                else:
                    # Fall back to body content (guard against pages with no <body>)
                    body = soup.body if soup.body else soup
                    job_content = body.get_text(separator='\n', strip=True)

            # Clean up the text
            job_content = re.sub(r'\n+', '\n', job_content).strip()

            if not job_content:
                logging.warning(f"No content extracted from {url}")
                # Return a fallback message that won't cause issues with embedding
                return "No job description content could be extracted from the provided URL."

            logging.info(f"Successfully scraped content from {url}")
            return job_content

        except requests.exceptions.RequestException as e:
            logging.error(f"Error scraping {url} (attempt {retry_count+1}/{max_retries}): {str(e)}")
            last_error = e
            retry_count += 1
        except Exception as e:
            logging.error(f"Unexpected error scraping {url} (attempt {retry_count+1}/{max_retries}): {str(e)}")
            last_error = e
            retry_count += 1

    # If all retries failed, return a fallback message
    error_message = f"Failed to scrape job description after {max_retries} attempts: {str(last_error)}"
    logging.error(error_message)
    return "Unable to access the job description at this time. Please try again later or provide the job description text directly."

def extract_job_details(text):
    """Extract structured job details from scraped text."""
    # This function can be expanded to extract specific job details
    # like job title, required skills, experience level, etc.
    # For now, we'll just return the cleaned text
    return text.strip()
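
# Minimal usage sketch (assumptions: run as a script; the URL below is a
# placeholder, not a real posting):
if __name__ == "__main__":
    target = "https://example.com/jobs/123"
    if is_url(target):
        description = scrape_job_description(target)
        print(extract_job_details(description))
    else:
        # Non-URL input is treated as raw job description text
        print(extract_job_details(target))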