import requests
from bs4 import BeautifulSoup
import re
import logging
import time
from random import uniform
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def is_url(text):
    """Check if the input text is a URL."""
    url_pattern = re.compile(
        r'^(?:http|https)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'
        r'(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
        r'localhost|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return bool(url_pattern.match(text))
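
# Illustrative usage of is_url (hypothetical inputs, shown for clarity):
#   is_url("https://example.com/jobs/123")   -> True
#   is_url("Senior Data Engineer - remote")  -> False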

def create_session_with_retry():
    """Create a requests session with retry capabilities."""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,  # Maximum number of retries per request
        backoff_factor=1,  # Exponential backoff multiplier between retries
        status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes worth retrying
        allowed_methods=["GET", "POST"]  # HTTP methods eligible for retry
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
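
# Note: with backoff_factor=1, urllib3 sleeps roughly 0s, 2s, then 4s between
# its own retries (backoff_factor * 2**(n-1); the exact schedule varies by
# urllib3 version). A quick manual check against an always-503 endpoint:
#   session = create_session_with_retry()
#   session.get("https://httpbin.org/status/503")  # retried up to 3 times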

def scrape_job_description(url, max_retries=3):
    """Scrape job description content from a URL with retry mechanism."""
    retry_count = 0
    last_error = None
    # One session reused across all attempts; its built-in Retry (see
    # create_session_with_retry) stacks with this function's outer loop.
    session = create_session_with_retry()
    
    # Different user agents to rotate
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
    ]
    
    while retry_count < max_retries:
        try:
            # Add random delay between retries to avoid rate limiting
            if retry_count > 0:
                sleep_time = uniform(1, 3) * retry_count
                logging.info(f"Retry {retry_count}/{max_retries} for {url}, waiting {sleep_time:.2f} seconds")
                time.sleep(sleep_time)
            
            # Rotate user agents
            headers = {
                'User-Agent': user_agents[retry_count % len(user_agents)],
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1'
            }
            
            # Use the retrying session; generous timeout for slow job boards
            response = session.get(url, headers=headers, timeout=15)
            response.raise_for_status()
        
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Remove script and style elements so their contents don't leak into the text
            for element in soup(["script", "style"]):
                element.extract()
            
            # Extract text from common job description containers
            job_content = ""
            
            # Look for common job description containers
            potential_containers = soup.select(
                '.job-description, .description, .job-details, '
                '#job-description, #description, #job-details, '
                '[class*="job"][class*="description"], '
                '[class*="job"][class*="details"], '
                '[id*="job"][id*="description"], '
                '[id*="job"][id*="details"]'
            )
            
            if potential_containers:
                for container in potential_containers:
                    job_content += container.get_text(separator='\n', strip=True) + "\n\n"
            else:
                # If no specific containers found, get the main content
                main_content = soup.select('main, article, .content, #content')
                if main_content:
                    for content in main_content:
                        job_content += content.get_text(separator='\n', strip=True) + "\n\n"
                else:
                    # Fallback to body content (guard against documents with no <body>)
                    body = soup.body if soup.body is not None else soup
                    job_content = body.get_text(separator='\n', strip=True)
            
            # Clean up the text
            job_content = re.sub(r'\n+', '\n', job_content).strip()
            
            if not job_content:
                logging.warning(f"No content extracted from {url}")
                # Return a fallback message that won't cause issues with embedding
                return "No job description content could be extracted from the provided URL."
                
            logging.info(f"Successfully scraped content from {url}")
            return job_content
                
        except requests.exceptions.RequestException as e:
            logging.error(f"Error scraping {url} (attempt {retry_count+1}/{max_retries}): {str(e)}")
            last_error = e
            retry_count += 1
        except Exception as e:
            logging.error(f"Unexpected error scraping {url} (attempt {retry_count+1}/{max_retries}): {str(e)}")
            last_error = e
            retry_count += 1
    
    # If all retries failed, return a fallback message
    error_message = f"Failed to scrape job description after {max_retries} attempts: {str(last_error)}"
    logging.error(error_message)
    return "Unable to access the job description at this time. Please try again later or provide the job description text directly."

def extract_job_details(text):
    """Extract structured job details from scraped text."""
    # This function can be expanded to extract specific job details
    # like job title, required skills, experience level, etc.
    # For now, we'll just return the cleaned text
    return text.strip()
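

# Minimal manual test harness (illustrative only; the URL below is a
# placeholder - substitute a real job posting URL to try it):
if __name__ == "__main__":
    sample = "https://example.com/careers/senior-data-engineer"  # hypothetical
    if is_url(sample):
        description = scrape_job_description(sample)
        print(extract_job_details(description)[:500])  # preview first 500 chars
    else:
        print("Input is not a URL; treat it as raw job description text.")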