"""Debug script to see what content we're scraping from job postings."""
|
|
|
|
|
import requests |
|
|
from bs4 import BeautifulSoup |
|
|
|
|
|
def debug_scrape(url: str, company_keyword: str = "microsoft") -> None:
    """Fetch a job posting page and print what a scraper would see.

    Three sections are written to stdout:

    1. The first 2000 characters of the page text, after ``<script>`` and
       ``<style>`` elements are removed and whitespace is collapsed.
    2. The first ten ``h1``/``h2``/``h3``/``title`` elements, skipping any
       whose text is empty.
    3. The first five text nodes that contain ``company_keyword``.

    Args:
        url: Address of the job posting to download.
        company_keyword: Substring to look for in the page's text nodes;
            matched case-insensitively. Defaults to ``"microsoft"``.

    Returns:
        None. Any exception raised while fetching or parsing is caught and
        reported on stdout instead of being propagated.
    """
    try:
        # Browser-like User-Agent; presumably needed so the careers site does
        # not reject the default client string -- TODO confirm.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        # Drop script and style elements so their source does not end up in
        # the extracted text.
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Collapse the page text: strip each line, break it into phrases,
        # drop the empty pieces and re-join everything with single spaces.
        raw_text = soup.get_text()
        lines = (line.strip() for line in raw_text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = " ".join(chunk for chunk in chunks if chunk)

        print("=== SCRAPED CONTENT ===")
        print(text[:2000])
        print("\n=== END SCRAPED CONTENT ===")

        print("\n=== LOOKING FOR JOB TITLE ===")
        title_elements = soup.find_all(["h1", "h2", "h3", "title"])
        for elem in title_elements[:10]:
            heading = elem.get_text().strip()
            if heading:
                print(f"Tag: {elem.name}, Text: {heading}")

        print("\n=== LOOKING FOR COMPANY INFO ===")
        needle = company_keyword.lower()
        # `string=` is the current name of the legacy `text=` argument.
        company_elements = soup.find_all(
            string=lambda node: node and needle in node.lower()
        )
        for elem in company_elements[:5]:
            print(f"Company text: {elem.strip()}")

    except Exception as e:
        # Deliberately broad: this is a debugging aid, so every failure is
        # reported on stdout rather than raised.
        print(f"Error scraping URL: {e}")
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Posting that is scraped when no URL is given on the command line.
    default_url = "https://jobs.careers.microsoft.com/global/en/job/1829758/Applied-Scientist-II-and-Senior-Applied-Scientist-(Multiple-Positions)---Office-AI-Platform-team"

    # An optional first command-line argument overrides the default posting.
    url = sys.argv[1] if len(sys.argv) > 1 else default_url
    debug_scrape(url)