# src/collectors/greenhouse.py
# Origin: repo "nsbecf", author acarey5, commit 851ce09 ("new scrapping").
from __future__ import annotations
from typing import List
import requests
from src.collectors.common import dedupe_jobs, normalize_job_posting
from src.detectors.ats_detector import extract_ats_identifier
from src.models import CompanyRecord, JobPosting
from src.resolver.jobs_page_resolver import ResolvedJobsPage
# Browser-like User-Agent so the Greenhouse boards API does not reject the
# request as coming from a generic script client.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
def collect(company_record: CompanyRecord, resolved_page: ResolvedJobsPage) -> List[JobPosting]:
    """Collect Greenhouse jobs via the public boards API.

    Resolves the company's Greenhouse board identifier (from the record or by
    sniffing the resolved careers page), fetches the board's job list with
    ``?content=true`` so descriptions are included, and normalizes each entry.

    Args:
        company_record: Company being collected; may carry ``ats_identifier``.
        resolved_page: The resolved careers/jobs page (URL + HTML) used both
            for identifier extraction and as a fallback job URL.

    Returns:
        Deduplicated list of JobPosting objects; empty list when no board
        identifier can be determined or the API call fails (best-effort).
    """
    identifier = company_record.ats_identifier or extract_ats_identifier(
        "greenhouse", resolved_page.url, resolved_page.html
    )
    if not identifier:
        return []

    api_url = f"https://boards-api.greenhouse.io/v1/boards/{identifier}/jobs?content=true"
    try:
        response = requests.get(api_url, headers=HEADERS, timeout=20)
        response.raise_for_status()
        payload = response.json()
    # Narrowed from bare ``except Exception``: RequestException covers network /
    # HTTP errors, ValueError covers malformed JSON (JSONDecodeError subclasses
    # it). Collection is deliberately best-effort, so we return [] rather than raise.
    except (requests.RequestException, ValueError):
        return []

    def _department_name(item: dict) -> str:
        # The boards API exposes ``departments`` as a list of {"id", "name"}
        # dicts; the old code read a non-existent ``department`` key. Keep the
        # old key as a fallback in case some payloads carry a plain string.
        departments = item.get("departments") or []
        if departments and isinstance(departments[0], dict):
            return departments[0].get("name") or ""
        legacy = item.get("department")
        return legacy if isinstance(legacy, str) else ""

    jobs = [
        normalize_job_posting(
            company_record,
            title=item.get("title", ""),
            location=((item.get("location") or {}).get("name") or ""),
            job_url=item.get("absolute_url", resolved_page.url),
            description=(item.get("content") or ""),
            department=_department_name(item),
            source_ats="greenhouse",
            resolved_url=resolved_page.url,
            posted_date=str(item.get("updated_at") or ""),
            raw_payload=item,
        )
        for item in payload.get("jobs", [])
        if item.get("title")  # skip malformed entries with no title
    ]
    return dedupe_jobs(jobs)