# resfit/resumer/utils/scraper.py
# Sajil Awale
# Initial commit: ResFit - AI Resume Tailor
# 629d435
import requests
import trafilatura
import random
def scrape_job_details(url):
    """Download a web page and return its main text content.

    The page is fetched with ``requests`` using browser-like headers, and the
    resulting HTML is passed to ``trafilatura.extract`` (formatting requested,
    links excluded, precision favoured).

    Args:
        url: Address of the page to download.

    Returns:
        The extracted content as a string. Failures are reported as a
        returned message rather than a raised exception:

        * ``"Error: Could not identify the main content of the page."`` when
          the extractor yields nothing;
        * ``"Network error: ..."`` for any ``requests`` failure, including
          HTTP error statuses surfaced by ``raise_for_status``;
        * ``"An unexpected error occurred: ..."`` for any other exception.
    """
    # 1. Setup headers to look like a real browser
    user_agents = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    )
    headers = {
        "User-Agent": random.choice(user_agents),
        "Accept-Language": "en-US,en;q=0.9",
    }

    try:
        # 2. Fetch the HTML manually using requests
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # Check for HTTP errors

        # When the server declares no charset for a text/* response, requests
        # falls back to ISO-8859-1, which garbles UTF-8 pages. In that case
        # let requests detect the encoding from the body instead.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding

        # 3. Pass the decoded HTML to trafilatura for extraction
        content = trafilatura.extract(
            response.text,
            include_formatting=True,
            include_links=False,
            favor_precision=True,
        )
    except requests.exceptions.RequestException as e:
        return f"Network error: {e}"
    except Exception as e:
        # Deliberate catch-all: callers receive a message string, never an
        # exception, from this function.
        return f"An unexpected error occurred: {e}"

    if not content:
        return "Error: Could not identify the main content of the page."
    return content
# --- Usage ---
# url = "https://careers.qualcomm.com/careers/job/446715275527?hl=en-US&domain=qualcomm.com&source=APPLICANT_SOURCE-6-2"
# print(scrape_job_details(url))