import requests
from bs4 import BeautifulSoup
import re
import logging
import time
from random import uniform
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def is_url(text):
    """Check if the input text is a URL."""
    url_pattern = re.compile(
        r'^(?:http|https)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'
        r'(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
        r'localhost|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return bool(url_pattern.match(text))
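
# Illustrative checks for is_url (the inputs are made-up examples, not real postings):
#   is_url("https://example.com/jobs/123")  -> True   (scheme + domain + path)
#   is_url("http://localhost:8000/job")     -> True   (localhost and port are accepted)
#   is_url("Senior Python Developer role")  -> False  (plain text, no scheme)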

def create_session_with_retry():
    """Create a requests session with retry capabilities."""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,  # Maximum number of retries
        backoff_factor=1,  # Time factor between retries
        status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
        allowed_methods=["GET", "POST"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
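
# Note on the Retry schedule above: with backoff_factor=1, urllib3 sleeps an
# exponentially growing interval between retried requests (on the order of
# 1s, 2s, 4s), capped at Retry.BACKOFF_MAX; the exact formula differs slightly
# between urllib3 1.x and 2.x. These transport-level retries stack with the
# manual retry loop in scrape_job_description below.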

def scrape_job_description(url, max_retries=3):
    """Scrape job description content from a URL with retry mechanism."""
    retry_count = 0
    last_error = None

    # Different user agents to rotate
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
    ]

    while retry_count < max_retries:
        try:
            # Add random delay between retries to avoid rate limiting
            if retry_count > 0:
                sleep_time = uniform(1, 3) * retry_count
                logging.info(f"Retry {retry_count}/{max_retries} for {url}, waiting {sleep_time:.2f} seconds")
                time.sleep(sleep_time)

            # Rotate user agents
            headers = {
                'User-Agent': user_agents[retry_count % len(user_agents)],
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1'
            }

            # Use session with retry capabilities
            session = create_session_with_retry()
            response = session.get(url, headers=headers, timeout=15)  # Increased timeout
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.extract()

            # Extract text from common job description containers
            job_content = ""

            # Look for common job description containers
            potential_containers = soup.select(
                '.job-description, .description, .job-details, '
                '#job-description, #description, #job-details, '
                '[class*="job"][class*="description"], '
                '[class*="job"][class*="details"], '
                '[id*="job"][id*="description"], '
                '[id*="job"][id*="details"]'
            )

            if potential_containers:
                for container in potential_containers:
                    job_content += container.get_text(separator='\n', strip=True) + "\n\n"
            else:
                # If no specific containers found, get the main content
                main_content = soup.select('main, article, .content, #content')
                if main_content:
                    for content in main_content:
                        job_content += content.get_text(separator='\n', strip=True) + "\n\n"
                else:
                    # Fall back to body content (guard against pages with no <body>)
                    body = soup.body if soup.body else soup
                    job_content = body.get_text(separator='\n', strip=True)

            # Clean up the text
            job_content = re.sub(r'\n+', '\n', job_content).strip()

            if not job_content:
                logging.warning(f"No content extracted from {url}")
                # Return a fallback message that won't cause issues with embedding
                return "No job description content could be extracted from the provided URL."

            logging.info(f"Successfully scraped content from {url}")
            return job_content

        except requests.exceptions.RequestException as e:
            logging.error(f"Error scraping {url} (attempt {retry_count+1}/{max_retries}): {str(e)}")
            last_error = e
            retry_count += 1
        except Exception as e:
            logging.error(f"Unexpected error scraping {url} (attempt {retry_count+1}/{max_retries}): {str(e)}")
            last_error = e
            retry_count += 1

    # If all retries failed, return a fallback message
    error_message = f"Failed to scrape job description after {max_retries} attempts: {str(last_error)}"
    logging.error(error_message)
    return "Unable to access the job description at this time. Please try again later or provide the job description text directly."

def extract_job_details(text):
    """Extract structured job details from scraped text."""
    # This function can be expanded to extract specific job details
    # like job title, required skills, experience level, etc.
    # For now, we'll just return the cleaned text
    return text.strip()
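
# Minimal usage sketch (assumptions: run as a script; the URL below is a
# placeholder, not a real posting):
if __name__ == "__main__":
    target = "https://example.com/jobs/123"
    if is_url(target):
        description = scrape_job_description(target)
        print(extract_job_details(description))
    else:
        # Non-URL input is treated as raw job description text
        print(extract_job_details(target))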