Spaces:
Running
Running
| import requests | |
| import trafilatura | |
| import random | |
def scrape_job_details(url: str) -> str:
    """Fetch a job posting page and return its extracted main content.

    The page is downloaded with ``requests`` using browser-like headers, and
    the resulting HTML is passed to ``trafilatura.extract`` with
    ``include_formatting=True``, ``include_links=False`` and
    ``favor_precision=True``.

    Args:
        url: Address of the page to scrape.

    Returns:
        The extracted content on success. This function never raises; on
        failure it returns one of these messages instead:

        * ``"Network error: ..."`` for any ``requests`` exception, including
          HTTP error statuses reported by ``raise_for_status()``.
        * ``"Error: Could not identify the main content of the page."`` when
          the extractor returns nothing.
        * ``"An unexpected error occurred: ..."`` for any other exception.
    """
    # Rotate between a couple of desktop browser identities so the request
    # looks like ordinary browser traffic.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ]
    headers = {
        "User-Agent": random.choice(user_agents),
        "Accept-Language": "en-US,en;q=0.9",
    }

    try:
        # Fetch the HTML ourselves so we control headers and timeout.
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # Turn 4xx/5xx statuses into exceptions.

        # requests derives the text encoding from HTTP headers only and falls
        # back to ISO-8859-1 for text/* responses that declare no charset,
        # which garbles UTF-8 pages. When the server did not declare one, use
        # the encoding detected from the body instead.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding

        # Hand the decoded HTML to trafilatura for main-content extraction.
        content = trafilatura.extract(
            response.text,
            include_formatting=True,
            include_links=False,
            favor_precision=True,
        )
    except requests.exceptions.RequestException as e:
        return f"Network error: {e}"
    except Exception as e:
        # Deliberate catch-all: callers rely on always getting a string back.
        return f"An unexpected error occurred: {e}"

    if not content:
        return "Error: Could not identify the main content of the page."
    return content
| # # --- Usage --- | |
| # url = "https://careers.qualcomm.com/careers/job/446715275527?hl=en-US&domain=qualcomm.com&source=APPLICANT_SOURCE-6-2" | |
| # print(scrape_job_details(url)) |