File size: 1,493 Bytes
629d435
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import requests
import trafilatura
import random

def scrape_job_details(url: str) -> str:
    """Download a web page and return its main text content.

    The page is fetched with ``requests`` using browser-like headers, and the
    resulting HTML is handed to ``trafilatura.extract`` (formatting kept,
    links dropped, precision favoured over recall).

    Args:
        url: Address of the page to scrape.

    Returns:
        The extracted content on success. This function does not raise;
        failures are reported as a message string instead:

        * ``"Error: Could not identify the main content of the page."`` when
          trafilatura returns nothing,
        * ``"Network error: ..."`` for any ``requests`` failure (connection
          problems, timeouts, invalid URLs, non-2xx status codes),
        * ``"An unexpected error occurred: ..."`` for anything else.
    """
    # 1. Setup headers to look like a real browser
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ]

    headers = {
        "User-Agent": random.choice(user_agents),
        "Accept-Language": "en-US,en;q=0.9",
    }

    try:
        # 2. Fetch the HTML manually using requests
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions

        # When the server sends a text Content-Type without a charset,
        # requests falls back to ISO-8859-1, which garbles UTF-8 pages.
        # In that case let requests detect the encoding from the body.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding

        # 3. Pass the decoded HTML to trafilatura for extraction
        content = trafilatura.extract(
            response.text,
            include_formatting=True,
            include_links=False,
            favor_precision=True,
        )

        if not content:
            return "Error: Could not identify the main content of the page."

        return content

    except requests.exceptions.RequestException as e:
        return f"Network error: {e}"
    except Exception as e:
        # Deliberate catch-all: callers expect a message string, never an
        # exception, from this function.
        return f"An unexpected error occurred: {e}"

# --- Usage example (uncomment to run) ---
# url = "https://careers.qualcomm.com/careers/job/446715275527?hl=en-US&domain=qualcomm.com&source=APPLICANT_SOURCE-6-2"
# print(scrape_job_details(url))