Spaces:
Sleeping
Sleeping
acarey5 commited on
Commit ·
851ce09
1
Parent(s): b8d2d77
new scraping
Browse files- data/cached_jobs.json +0 -0
- data/nsbe_companies.csv +7 -0
- src/cache.py +50 -0
- src/collect_jobs.py +160 -0
- src/collectors/__init__.py +1 -0
- src/collectors/common.py +78 -0
- src/collectors/generic.py +53 -0
- src/collectors/greenhouse.py +45 -0
- src/collectors/lever.py +46 -0
- src/collectors/smartrecruiters.py +46 -0
- src/collectors/workday.py +18 -0
- src/company_loader.py +63 -0
- src/detectors/__init__.py +1 -0
- src/detectors/ats_detector.py +60 -0
- src/models.py +19 -2
- src/resolver/__init__.py +1 -0
- src/resolver/jobs_page_resolver.py +217 -0
data/cached_jobs.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/nsbe_companies.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
company,careers_url,ats_type,ats_identifier,priority
|
| 2 |
+
Stripe,https://stripe.com/jobs/search,greenhouse,stripe,10
|
| 3 |
+
Netflix,https://jobs.netflix.com/,lever,netflix,10
|
| 4 |
+
Capital One,https://www.capitalonecareers.com/,workday,,9
|
| 5 |
+
Chevron,https://chevron.wd5.myworkdayjobs.com/ChevronCareers,workday,,8
|
| 6 |
+
ServiceNow,https://careers.servicenow.com/careers,smartrecruiters,,8
|
| 7 |
+
ABB,https://global.abb/group/en/careers,generic,,7
|
src/cache.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Iterable, List, Sequence
|
| 7 |
+
|
| 8 |
+
from src.models import JobPosting
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def load_cached_jobs(cache_path: str | Path) -> dict:
    """Read the cached jobs payload from disk.

    Returns an empty payload (blank timestamp, no companies, no jobs) when
    the cache file is missing or cannot be parsed as JSON.
    """
    empty_payload = {"generated_at": "", "companies": [], "jobs": []}
    path = Path(cache_path)
    if not path.exists():
        return empty_payload
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        # A corrupt or unreadable cache is treated the same as no cache.
        return empty_payload
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def write_cached_jobs(cache_path: str | Path, jobs: Sequence[JobPosting], companies: Iterable[str]) -> Path:
    """Serialize *jobs* plus the deduplicated *companies* list to *cache_path*.

    Parent directories are created on demand. Returns the path written.
    """
    target = Path(cache_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    document = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "companies": sorted(set(companies)),
        "jobs": [posting.to_dict() for posting in jobs],
    }
    target.write_text(json.dumps(document, indent=2), encoding="utf-8")
    return target
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def merge_cached_jobs(cache_path: str | Path, refreshed_jobs: Sequence[JobPosting], refreshed_companies: Iterable[str]) -> Path:
    """Merge freshly collected jobs into the on-disk cache.

    Cached jobs belonging to a refreshed company are replaced by the new
    results; jobs for companies that were not refreshed are kept untouched.
    Returns the path written.
    """
    refreshed = set(refreshed_companies)
    existing = load_cached_jobs(cache_path)

    kept = [entry for entry in existing.get("jobs", []) if entry.get("company") not in refreshed]
    fresh = [posting.to_dict() for posting in refreshed_jobs]
    all_companies = set(existing.get("companies", [])) | refreshed

    document = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "companies": sorted(all_companies),
        "jobs": kept + fresh,
    }

    target = Path(cache_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(document, indent=2), encoding="utf-8")
    return target
|
src/collect_jobs.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Iterable, List, Optional
|
| 8 |
+
|
| 9 |
+
from src import cache
|
| 10 |
+
from src.collectors import generic, greenhouse, lever, smartrecruiters, workday
|
| 11 |
+
from src.collectors.common import dedupe_jobs
|
| 12 |
+
from src.company_loader import load_company_records
|
| 13 |
+
from src.detectors.ats_detector import detect_ats_type, extract_ats_identifier
|
| 14 |
+
from src.jobs.debug_utils import save_debug_html
|
| 15 |
+
from src.models import CompanyRecord, JobPosting
|
| 16 |
+
from src.resolver.jobs_page_resolver import ResolvedJobsPage, resolve_real_jobs_page
|
| 17 |
+
|
| 18 |
+
LOGGER = logging.getLogger("career_fair_matcher.collect_jobs")
|
| 19 |
+
|
| 20 |
+
CONNECTOR_REGISTRY = {
|
| 21 |
+
"greenhouse": greenhouse.collect,
|
| 22 |
+
"lever": lever.collect,
|
| 23 |
+
"workday": workday.collect,
|
| 24 |
+
"smartrecruiters": smartrecruiters.collect,
|
| 25 |
+
"generic": generic.collect,
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class CompanyCollectionOutcome:
    """Structured result of one company's collection attempt, used for logging."""

    # Curated company name.
    company: str
    # careers_url exactly as listed in the curated CSV.
    original_url: str
    # URL the resolver actually landed on.
    resolved_url: str
    # How the resolved page was fetched (e.g. "requests").
    fetch_method: str
    # Canonical ATS provider key detected for the resolved page.
    ats_detected: str
    # Connector that produced the jobs (may differ from ats_detected after fallback).
    connector_used: str
    # Number of jobs collected for this company.
    jobs_collected: int
    # Empty string when collection succeeded.
    failure_reason: str = ""
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def configure_logging(level: int = logging.INFO) -> None:
    """Install a bare-message logging configuration once.

    NOTE(review): the guard inspects LOGGER.handlers, but basicConfig attaches
    handlers to the root logger, not to LOGGER — so the guard only trips if a
    handler was added to LOGGER elsewhere. Confirm this is the intended
    idempotency check.
    """
    if LOGGER.handlers:
        return
    logging.basicConfig(level=level, format="%(message)s")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _log_company_event(outcome: CompanyCollectionOutcome) -> None:
    """Emit one structured JSON line describing a company collection outcome."""
    event = {
        "company": outcome.company,
        "original_careers_url": outcome.original_url,
        "resolved_url": outcome.resolved_url,
        "fetch_method": outcome.fetch_method,
        "ats_detected": outcome.ats_detected,
        "connector_used": outcome.connector_used,
        "jobs_collected": outcome.jobs_collected,
        "failure_reason": outcome.failure_reason,
    }
    LOGGER.info(json.dumps(event))
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _save_resolution_snapshots(debug_dir: Path, company: CompanyRecord, resolved_page: ResolvedJobsPage) -> None:
    """Persist every intermediate HTML snapshot plus the final resolved page for debugging."""
    # One debug file per resolution stage, keyed by the stage name.
    for stage, html in resolved_page.html_snapshots.items():
        save_debug_html(company.company, html, stage, debug_dir)
    # Always persist the page the resolver finally settled on.
    save_debug_html(company.company, resolved_page.html, "resolved", debug_dir)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _collect_from_connector(company: CompanyRecord, resolved_page: ResolvedJobsPage, ats_type: str) -> tuple[List[JobPosting], str, str]:
    """Run the ATS-specific connector for *ats_type*, falling back to generic scraping.

    Returns a (jobs, connector_name, failure_reason) triple; jobs are deduped
    and failure_reason is "" on success.
    """
    # Unknown ATS types route straight to the generic scraper.
    connector_name = ats_type if ats_type in CONNECTOR_REGISTRY else "generic"
    connector = CONNECTOR_REGISTRY[connector_name]
    jobs = connector(company, resolved_page)

    failure_reason = ""
    if not jobs and connector_name != "generic":
        # Dedicated connector came back empty: retry with the generic scraper.
        jobs = generic.collect(company, resolved_page)
        failure_reason = "ATS_PARSE_FAILED" if not jobs else ""
        connector_name = "generic"

    if not jobs:
        # NOTE(review): this branch replaces the "ATS_PARSE_FAILED" marker set
        # above with the resolver's failure reason (or "NO_JOBS_FOUND"), so
        # "ATS_PARSE_FAILED" is never actually returned — confirm intended.
        resolution_failure = resolved_page.failure_reason or "NO_JOBS_FOUND"
        return [], connector_name, resolution_failure
    return dedupe_jobs(jobs), connector_name, failure_reason
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def collect_jobs_for_company(company: CompanyRecord, debug_dir: Path) -> tuple[List[JobPosting], CompanyCollectionOutcome]:
    """Resolve, detect, collect, normalize, and log a single curated company.

    Returns the collected postings plus a CompanyCollectionOutcome describing
    how the collection went (also emitted as a structured log line).
    """
    resolved_page = resolve_real_jobs_page(company.careers_url)
    _save_resolution_snapshots(debug_dir, company, resolved_page)

    ats_type = detect_ats_type(resolved_page.url, resolved_page.html, company.ats_type)
    if not company.ats_identifier:
        # Fill in the board identifier lazily so connectors can hit their APIs.
        company.ats_identifier = extract_ats_identifier(ats_type, resolved_page.url, resolved_page.html)

    jobs, connector_used, failure_reason = _collect_from_connector(company, resolved_page, ats_type)
    # Connectors already return JobPosting instances; the previous
    # `job if isinstance(job, JobPosting) else job` comprehension was a no-op
    # (both branches yielded the same object), so a plain copy suffices.
    normalized_jobs = list(jobs)
    outcome = CompanyCollectionOutcome(
        company=company.company,
        original_url=company.careers_url,
        resolved_url=resolved_page.url,
        fetch_method=resolved_page.fetch_method,
        ats_detected=ats_type,
        connector_used=connector_used,
        jobs_collected=len(normalized_jobs),
        failure_reason=failure_reason,
    )
    _log_company_event(outcome)
    return normalized_jobs, outcome
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def refresh_selected_companies(
    *,
    company_csv: str | Path,
    cache_path: str | Path,
    debug_dir: str | Path,
    selected_companies: Optional[Iterable[str]] = None,
    limit: Optional[int] = None,
) -> tuple[List[JobPosting], List[CompanyCollectionOutcome], Path]:
    """Refresh cached jobs for the curated list, preserving partial results on failure.

    Keyword-only arguments:
        company_csv: curated company CSV path.
        cache_path: JSON cache file to merge results into.
        debug_dir: directory for HTML debug snapshots.
        selected_companies: optional subset of company names to refresh.
        limit: optional cap on how many companies are loaded.

    Returns (deduped jobs, per-company outcomes, cache file path).
    """
    configure_logging()
    debug_path = Path(debug_dir)
    companies = load_company_records(company_csv, limit=limit, selected_companies=set(selected_companies or []))

    all_jobs: List[JobPosting] = []
    outcomes: List[CompanyCollectionOutcome] = []

    for company in companies:
        try:
            company_jobs, outcome = collect_jobs_for_company(company, debug_path)
            all_jobs.extend(company_jobs)
            outcomes.append(outcome)
        except Exception as exc:
            # Per-company failures are recorded and logged but never fatal —
            # a broken site must not abort the whole refresh run.
            outcome = CompanyCollectionOutcome(
                company=company.company,
                original_url=company.careers_url,
                resolved_url=company.careers_url,
                fetch_method="requests",
                ats_detected=company.ats_type or "generic",
                connector_used="generic",
                jobs_collected=0,
                failure_reason=str(exc) or "REQUEST_FAILED",
            )
            outcomes.append(outcome)
            _log_company_event(outcome)

    refreshed_companies = [company.company for company in companies]
    # Merge rather than overwrite so unrefreshed companies keep their cached jobs.
    cache_file = cache.merge_cached_jobs(cache_path, dedupe_jobs(all_jobs), refreshed_companies)
    return dedupe_jobs(all_jobs), outcomes, cache_file
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def main() -> None:
    """Script entry point: refresh all curated companies using repo-relative paths."""
    # Resolve paths relative to the repository root (two levels up from this file).
    base_dir = Path(__file__).resolve().parent.parent
    company_csv = base_dir / "data" / "nsbe_companies.csv"
    cache_path = base_dir / "data" / "cached_jobs.json"
    debug_dir = base_dir / "debug_html" / "collect_jobs"
    refresh_selected_companies(company_csv=company_csv, cache_path=cache_path, debug_dir=debug_dir)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
src/collectors/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""ATS-specific and generic job collectors."""
|
src/collectors/common.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Iterable, List
|
| 4 |
+
|
| 5 |
+
from src.models import CompanyRecord, JobPosting
|
| 6 |
+
|
| 7 |
+
# Lower-case substrings that indicate an entry-level/early-career posting.
ENTRY_LEVEL_SIGNALS = [
    "new grad",
    "university",
    "early career",
    "associate",
    "entry level",
    "campus",
    "intern",
    "internship",
]

# Seniority markers that veto an entry-level classification.
SENIOR_SIGNALS = ["senior", "staff", "principal", "manager", "director"]

# Substrings that indicate a remote-friendly posting.
REMOTE_SIGNALS = ["remote", "hybrid remote", "work from home"]


def is_remote_role(*texts: str) -> bool:
    """Return True when any remote-work signal appears in the combined text."""
    haystack = " ".join(texts).lower()
    for marker in REMOTE_SIGNALS:
        if marker in haystack:
            return True
    return False


def is_entry_level_role(*texts: str) -> bool:
    """Return True for entry-level text, unless a seniority signal is present."""
    haystack = " ".join(texts).lower()
    if any(marker in haystack for marker in SENIOR_SIGNALS):
        # Seniority markers always win over entry-level markers.
        return False
    return any(marker in haystack for marker in ENTRY_LEVEL_SIGNALS)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def normalize_job_posting(
    company_record: CompanyRecord,
    *,
    title: str,
    location: str,
    job_url: str,
    description: str,
    department: str = "",
    source_ats: str = "generic",
    resolved_url: str = "",
    employment_type: str = "",
    posted_date: str = "",
    raw_payload: dict | None = None,
) -> JobPosting:
    """Normalize connector output into the shared JobPosting schema.

    Free-text fields are stripped and truncated (title/location/department to
    160 chars, description to 4000, employment type and posted date to 80) to
    keep the cache bounded; remote/entry-level flags are derived from the
    signal lists in this module.
    """
    return JobPosting(
        company=company_record.company,
        title=title.strip()[:160],
        location=location.strip()[:160],
        url=job_url.strip(),
        department=department.strip()[:160],
        description=description.strip()[:4000],
        # Both the legacy "ats" field and the newer "source_ats" carry the same value.
        ats=source_ats,
        source_ats=source_ats,
        # Fall back to the curated careers URL when no resolved URL is known.
        resolved_url=resolved_url or company_record.careers_url,
        employment_type=employment_type.strip()[:80],
        posted_date=posted_date.strip()[:80],
        is_remote=is_remote_role(title, location, description),
        is_entry_level=is_entry_level_role(title, description),
        raw_payload=raw_payload or {},
    )
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def dedupe_jobs(jobs: Iterable[JobPosting]) -> List[JobPosting]:
    """Deduplicate jobs by normalized title and URL, keeping first occurrences."""
    unique: List[JobPosting] = []
    seen_fingerprints = set()
    for posting in jobs:
        fingerprint = (posting.title.strip().lower(), posting.url.strip().lower())
        if fingerprint not in seen_fingerprints:
            seen_fingerprints.add(fingerprint)
            unique.append(posting)
    return unique
|
src/collectors/generic.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
from src.collectors.common import dedupe_jobs, normalize_job_posting
|
| 7 |
+
from src.jobs.extractor import ExtractionDiagnostics, extract_jobs_with_diagnostics
|
| 8 |
+
from src.models import CompanyRecord, JobPosting
|
| 9 |
+
from src.resolver.jobs_page_resolver import ResolvedJobsPage
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class GenericCollectionDetails:
    """Outcome of a generic collection pass, including extractor diagnostics."""

    # Normalized, deduplicated postings (empty on failure).
    jobs: List[JobPosting] = field(default_factory=list)
    # Extractor failure classification; "UNKNOWN" until set from diagnostics.
    failure_reason: str = "UNKNOWN"
    # Raw diagnostics from the extraction pass, if available.
    diagnostics: ExtractionDiagnostics | None = None
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def collect(company_record: CompanyRecord, resolved_page: ResolvedJobsPage) -> List[JobPosting]:
    """Generic fallback that parses resolved HTML and embedded JSON job data."""
    # Thin wrapper: delegate to collect_with_details and discard the diagnostics.
    return collect_with_details(company_record, resolved_page).jobs
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def collect_with_details(
    company_record: CompanyRecord,
    resolved_page: ResolvedJobsPage,
    source_ats: str = "generic",
) -> GenericCollectionDetails:
    """Run the generic extractor and return jobs together with diagnostics.

    The extractor's parsed jobs are re-normalized into the shared JobPosting
    schema and deduplicated; the extractor's failure classification is
    surfaced unchanged in the returned details.
    """
    parsed_jobs, diagnostics = extract_jobs_with_diagnostics(
        company_record,
        resolved_page.html,
        source_ats,
        base_url=resolved_page.url,
    )
    normalized = [
        normalize_job_posting(
            company_record,
            title=job.title,
            location=job.location,
            job_url=job.url,
            description=job.description,
            department=job.department,
            source_ats=source_ats,
            resolved_url=resolved_page.url,
            raw_payload=job.raw_payload,
        )
        for job in parsed_jobs
    ]
    return GenericCollectionDetails(
        jobs=dedupe_jobs(normalized),
        failure_reason=diagnostics.failure_type,
        diagnostics=diagnostics,
    )
|
src/collectors/greenhouse.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
|
| 7 |
+
from src.collectors.common import dedupe_jobs, normalize_job_posting
|
| 8 |
+
from src.detectors.ats_detector import extract_ats_identifier
|
| 9 |
+
from src.models import CompanyRecord, JobPosting
|
| 10 |
+
from src.resolver.jobs_page_resolver import ResolvedJobsPage
|
| 11 |
+
|
| 12 |
+
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def collect(company_record: CompanyRecord, resolved_page: ResolvedJobsPage) -> List[JobPosting]:
    """Collect Greenhouse jobs via the public boards API.

    Returns an empty list when no board identifier can be determined or the
    API request fails; callers then fall back to the generic scraper.
    """
    identifier = company_record.ats_identifier or extract_ats_identifier("greenhouse", resolved_page.url, resolved_page.html)
    if not identifier:
        return []

    api_url = f"https://boards-api.greenhouse.io/v1/boards/{identifier}/jobs?content=true"
    try:
        response = requests.get(api_url, headers=HEADERS, timeout=20)
        response.raise_for_status()
        payload = response.json()
    except Exception:
        # Network/HTTP/JSON failures degrade to "no jobs".
        return []

    jobs = [
        normalize_job_posting(
            company_record,
            title=item.get("title", ""),
            location=((item.get("location") or {}).get("name") or ""),
            job_url=item.get("absolute_url", resolved_page.url),
            description=(item.get("content") or ""),
            department=_department_name(item),
            source_ats="greenhouse",
            resolved_url=resolved_page.url,
            posted_date=str(item.get("updated_at") or ""),
            raw_payload=item,
        )
        for item in payload.get("jobs", [])
        if item.get("title")
    ]
    return dedupe_jobs(jobs)


def _department_name(item: dict) -> str:
    """Extract a department string from a Greenhouse job payload.

    The Job Board API reports departments as a "departments" list of objects
    with a "name" field; the old code read a singular "department" key, which
    yielded an empty string (or a non-string that broke downstream .strip()).
    The singular key is kept as a string-only fallback.
    """
    departments = item.get("departments")
    if isinstance(departments, list):
        names = [str(d.get("name") or "") for d in departments if isinstance(d, dict)]
        joined = ", ".join(name for name in names if name)
        if joined:
            return joined
    fallback = item.get("department")
    return fallback if isinstance(fallback, str) else ""
|
src/collectors/lever.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
|
| 7 |
+
from src.collectors.common import dedupe_jobs, normalize_job_posting
|
| 8 |
+
from src.detectors.ats_detector import extract_ats_identifier
|
| 9 |
+
from src.models import CompanyRecord, JobPosting
|
| 10 |
+
from src.resolver.jobs_page_resolver import ResolvedJobsPage
|
| 11 |
+
|
| 12 |
+
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def collect(company_record: CompanyRecord, resolved_page: ResolvedJobsPage) -> List[JobPosting]:
    """Collect Lever jobs via the public postings endpoint."""
    identifier = company_record.ats_identifier or extract_ats_identifier("lever", resolved_page.url, resolved_page.html)
    if not identifier:
        return []

    api_url = f"https://api.lever.co/v0/postings/{identifier}?mode=json"
    try:
        response = requests.get(api_url, headers=HEADERS, timeout=20)
        response.raise_for_status()
        payload = response.json()
    except Exception:
        # Any request/parse failure degrades to "no jobs".
        return []

    postings: List[JobPosting] = []
    for entry in payload:
        buckets = entry.get("categories") or {}
        postings.append(
            normalize_job_posting(
                company_record,
                title=entry.get("text", ""),
                location=buckets.get("location", ""),
                job_url=entry.get("hostedUrl", resolved_page.url),
                description=entry.get("descriptionPlain", ""),
                department=buckets.get("team", ""),
                source_ats="lever",
                resolved_url=resolved_page.url,
                employment_type=buckets.get("commitment", ""),
                raw_payload=entry,
            )
        )
    return dedupe_jobs(postings)
|
src/collectors/smartrecruiters.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
|
| 7 |
+
from src.collectors.common import dedupe_jobs, normalize_job_posting
|
| 8 |
+
from src.detectors.ats_detector import extract_ats_identifier
|
| 9 |
+
from src.models import CompanyRecord, JobPosting
|
| 10 |
+
from src.resolver.jobs_page_resolver import ResolvedJobsPage
|
| 11 |
+
|
| 12 |
+
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def collect(company_record: CompanyRecord, resolved_page: ResolvedJobsPage) -> List[JobPosting]:
    """Collect SmartRecruiters jobs when a company identifier is available.

    Returns an empty list on missing identifier or request failure; callers
    fall back to the generic scraper.
    """
    identifier = company_record.ats_identifier or extract_ats_identifier("smartrecruiters", resolved_page.url, resolved_page.html)
    if not identifier:
        return []

    api_url = f"https://api.smartrecruiters.com/v1/companies/{identifier}/postings"
    try:
        response = requests.get(api_url, headers=HEADERS, timeout=20)
        response.raise_for_status()
        payload = response.json()
    except Exception:
        return []

    jobs = []
    for item in payload.get("content", []):
        # Department can be an object in the postings API; only pass strings
        # downstream (normalize_job_posting calls .strip() on it).
        raw_department = item.get("department", "")
        jobs.append(
            normalize_job_posting(
                company_record,
                title=item.get("name", ""),
                location=(item.get("location") or {}).get("city", ""),
                job_url=item.get("ref", resolved_page.url),
                description=_job_ad_text(item.get("jobAd")),
                department=raw_department if isinstance(raw_department, str) else "",
                source_ats="smartrecruiters",
                resolved_url=resolved_page.url,
                employment_type=item.get("typeOfEmployment", ""),
                posted_date=str(item.get("releasedDate") or ""),
                raw_payload=item,
            )
        )
    return dedupe_jobs(jobs)


def _job_ad_text(job_ad: object) -> str:
    """Flatten a SmartRecruiters jobAd "sections" payload into plain text.

    The API returns sections as a mapping of section name -> {"title", "text"};
    the old code passed that dict straight through as the description, which
    crashed the downstream .strip() call in normalize_job_posting.
    """
    if not isinstance(job_ad, dict):
        return ""
    sections = job_ad.get("sections")
    if isinstance(sections, str):
        return sections
    if not isinstance(sections, dict):
        return ""
    parts = []
    for section in sections.values():
        if isinstance(section, dict):
            text = section.get("text") or ""
            if text:
                parts.append(str(text))
    return "\n".join(parts)
|
src/collectors/workday.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
from src.collectors.common import dedupe_jobs
|
| 6 |
+
from src.collectors.generic import collect_with_details
|
| 7 |
+
from src.models import CompanyRecord, JobPosting
|
| 8 |
+
from src.resolver.jobs_page_resolver import ResolvedJobsPage
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def collect(company_record: CompanyRecord, resolved_page: ResolvedJobsPage) -> List[JobPosting]:
    """Best-effort Workday collection.

    Workday endpoints vary widely by tenant, so this connector currently relies on the
    resolved HTML plus embedded JSON extraction until a tenant-specific endpoint is configured.
    """
    outcome = collect_with_details(company_record, resolved_page, source_ats="workday")
    return dedupe_jobs(outcome.jobs)
|
src/company_loader.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import csv
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Iterable, List, Optional, Set
|
| 6 |
+
|
| 7 |
+
from src.models import CompanyRecord
|
| 8 |
+
|
| 9 |
+
COMPANY_KEYS = ["company", "company list", "name"]
|
| 10 |
+
CAREERS_KEYS = ["careers_url", "career url", "jobs_url", "direct links to company career/job openings page"]
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _pick_value(row: dict[str, str], keys: Iterable[str]) -> str:
|
| 14 |
+
for key in keys:
|
| 15 |
+
value = row.get(key, "").strip()
|
| 16 |
+
if value:
|
| 17 |
+
return value
|
| 18 |
+
return ""
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _parse_priority(value: str) -> int:
|
| 22 |
+
try:
|
| 23 |
+
return int(value.strip()) if value.strip() else 0
|
| 24 |
+
except Exception:
|
| 25 |
+
return 0
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def load_company_records(
    csv_path: str | Path,
    *,
    limit: Optional[int] = None,
    selected_companies: Optional[Set[str]] = None,
) -> List[CompanyRecord]:
    """Load the curated company list used by the targeted job collector.

    Args:
        csv_path: CSV whose header names are matched case-insensitively
            against COMPANY_KEYS / CAREERS_KEYS.
        limit: optional cap on the number of records returned.
        selected_companies: when non-empty, only exactly matching company
            names are kept.

    Raises:
        FileNotFoundError: when the CSV does not exist.
    """
    path = Path(csv_path)
    if not path.exists():
        raise FileNotFoundError(f"Company CSV not found: {path}")

    companies: List[CompanyRecord] = []
    # utf-8-sig strips the BOM that Excel-exported CSVs often carry.
    with path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.DictReader(handle)
        for raw_row in reader:
            # Normalize header names and cell values for tolerant matching.
            row = {str(key).strip().lower(): str(value or "").strip() for key, value in raw_row.items() if key}
            company = _pick_value(row, COMPANY_KEYS)
            careers_url = _pick_value(row, CAREERS_KEYS)
            # Rows without both a name and a careers URL are unusable.
            if not company or not careers_url:
                continue
            if selected_companies and company not in selected_companies:
                continue

            companies.append(
                CompanyRecord(
                    company=company,
                    careers_url=careers_url,
                    ats_type=row.get("ats_type", ""),
                    ats_identifier=row.get("ats_identifier", ""),
                    priority=_parse_priority(row.get("priority", "0")),
                    source="curated",
                    # Keep the full normalized row for downstream inspection.
                    meta=row,
                )
            )

    return companies[:limit] if limit else companies
|
src/detectors/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""ATS detection helpers."""
|
src/detectors/ats_detector.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from urllib.parse import parse_qs, urlparse
|
| 5 |
+
|
| 6 |
+
# Substrings that identify each supported ATS in a URL or page body.
KNOWN_ATS = {
    "greenhouse": ["greenhouse.io"],
    "lever": ["lever.co", "jobs.lever.co"],
    "workday": ["myworkdayjobs.com", "workday.com"],
    "smartrecruiters": ["smartrecruiters.com"],
    "icims": ["icims.com"],
    "ashby": ["ashbyhq.com", "jobs.ashbyhq.com"],
    "successfactors": ["successfactors.com", "career8.successfactors.com"],
}


def normalize_ats_type(value: str) -> str:
    """Map a free-form ATS label to a canonical key, defaulting to "generic"."""
    canonical = (value or "").strip().lower()
    return canonical if canonical in KNOWN_ATS else "generic"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def detect_ats_type(url: str, html: str = "", declared_ats: str = "") -> str:
    """Detect the ATS provider from explicit config, resolved URL, or HTML content."""
    # Explicit configuration always wins over sniffing.
    if declared_ats:
        return normalize_ats_type(declared_ats)

    haystack = f"{url} {html}".lower()
    for provider, markers in KNOWN_ATS.items():
        for marker in markers:
            if marker in haystack:
                return provider
    return "generic"
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def extract_ats_identifier(ats_type: str, url: str, html: str = "") -> str:
|
| 38 |
+
"""Best-effort extraction of ATS board identifiers for connector API usage."""
|
| 39 |
+
patterns = {
|
| 40 |
+
"greenhouse": r"greenhouse\.io/([^/?#]+)",
|
| 41 |
+
"lever": r"lever\.co/([^/?#]+)",
|
| 42 |
+
"smartrecruiters": r"smartrecruiters\.com/([^/?#]+)",
|
| 43 |
+
"ashby": r"ashbyhq\.com/([^/?#]+)",
|
| 44 |
+
}
|
| 45 |
+
if ats_type in patterns:
|
| 46 |
+
match = re.search(patterns[ats_type], f"{url} {html}")
|
| 47 |
+
return match.group(1) if match else ""
|
| 48 |
+
|
| 49 |
+
if ats_type == "successfactors":
|
| 50 |
+
parsed = urlparse(url)
|
| 51 |
+
query = parse_qs(parsed.query)
|
| 52 |
+
values = query.get("company") or query.get("_s.crb") or []
|
| 53 |
+
return values[0] if values else ""
|
| 54 |
+
|
| 55 |
+
if ats_type == "workday":
|
| 56 |
+
parsed = urlparse(url)
|
| 57 |
+
parts = [part for part in parsed.path.split("/") if part]
|
| 58 |
+
return "/".join(parts[:3]) if parts else ""
|
| 59 |
+
|
| 60 |
+
return ""
|
src/models.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
from dataclasses import dataclass, field
|
| 2 |
-
from typing import Dict, List
|
| 3 |
|
| 4 |
|
| 5 |
@dataclass
|
|
@@ -19,6 +19,9 @@ class CompanyRecord:
|
|
| 19 |
careers_url: str = ""
|
| 20 |
source: str = "default"
|
| 21 |
meta: Dict[str, str] = field(default_factory=dict)
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
@dataclass
|
|
@@ -30,6 +33,20 @@ class JobPosting:
|
|
| 30 |
department: str = ""
|
| 31 |
description: str = ""
|
| 32 |
ats: str = "unknown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
@dataclass
|
|
|
|
| 1 |
+
from dataclasses import asdict, dataclass, field
|
| 2 |
+
from typing import Any, Dict, List
|
| 3 |
|
| 4 |
|
| 5 |
@dataclass
|
|
|
|
| 19 |
careers_url: str = ""
|
| 20 |
source: str = "default"
|
| 21 |
meta: Dict[str, str] = field(default_factory=dict)
|
| 22 |
+
ats_type: str = ""
|
| 23 |
+
ats_identifier: str = ""
|
| 24 |
+
priority: int = 0
|
| 25 |
|
| 26 |
|
| 27 |
@dataclass
|
|
|
|
| 33 |
department: str = ""
|
| 34 |
description: str = ""
|
| 35 |
ats: str = "unknown"
|
| 36 |
+
source_ats: str = "unknown"
|
| 37 |
+
resolved_url: str = ""
|
| 38 |
+
employment_type: str = ""
|
| 39 |
+
posted_date: str = ""
|
| 40 |
+
is_remote: bool = False
|
| 41 |
+
is_entry_level: bool = False
|
| 42 |
+
failure_reason: str = ""
|
| 43 |
+
raw_payload: Dict[str, Any] = field(default_factory=dict)
|
| 44 |
+
|
| 45 |
+
def to_dict(self) -> Dict[str, Any]:
    """Serialize the posting for caching, exposing the link as "job_url".

    The dataclass's "url" field is renamed to "job_url" in the payload, and
    "source_ats" falls back to the legacy "ats" field when empty.
    """
    data = asdict(self)
    # Cached payloads expose the link under "job_url" rather than "url".
    data["job_url"] = data.pop("url")
    data["source_ats"] = data.get("source_ats") or data.get("ats", "unknown")
    return data
|
| 50 |
|
| 51 |
|
| 52 |
@dataclass
|
src/resolver/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Jobs page resolution utilities."""
|
src/resolver/jobs_page_resolver.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import importlib
|
| 4 |
+
import re
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Dict, List
|
| 7 |
+
from urllib.parse import urljoin
|
| 8 |
+
|
| 9 |
+
import requests
|
| 10 |
+
from bs4 import BeautifulSoup
|
| 11 |
+
|
| 12 |
+
# Browser-like User-Agent so career sites don't serve bot-blocked responses.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
# URL fragments that identify pages hosted by a known ATS provider.
ATS_LINK_PATTERNS = [
    "greenhouse.io",
    "jobs.lever.co",
    "myworkdayjobs.com",
    "smartrecruiters.com",
    "icims.com",
    "ashbyhq.com",
    "successfactors.com",
]
# Call-to-action phrases that typically label a link to the job listing page.
JOB_CTA_HINTS = [
    "jobs",
    "job search",
    "search jobs",
    "view all jobs",
    "open positions",
    "current openings",
    "careers search",
    "all openings",
    "explore jobs",
]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass
class ResolvedJobsPage:
    """Outcome of resolving a careers landing page to its real jobs page."""

    requested_url: str  # URL the caller originally asked to resolve
    url: str  # URL at which a usable jobs page was found
    html: str  # HTML captured from the resolved page
    fetch_method: str  # how the page was fetched: "requests", "playwright", or "none"
    final_url: str = ""  # last URL after redirects (mirrors `url` when set)
    fallback_used: bool = False  # True when the Playwright fallback was used
    failure_reason: str = ""  # e.g. "REQUEST_FAILED", "JS_PAGE", "SHELL_PAGE"; "" on success
    resolution_steps: List[str] = field(default_factory=list)  # audit trail of fetch/discovery hops
    html_snapshots: Dict[str, str] = field(default_factory=dict)  # per-step HTML captures
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def find_ats_link(html: str, base_url: str) -> str:
    """Return the first ATS-hosted link or iframe source found in the page."""
    document = BeautifulSoup(html or "", "html.parser")
    for element in document.select("a[href], iframe[src]"):
        raw_target = element.get("href") or element.get("src") or ""
        candidate = urljoin(base_url, raw_target.strip())
        lowered = candidate.lower()
        # Any known ATS domain fragment in the resolved URL is a hit.
        if any(marker in lowered for marker in ATS_LINK_PATTERNS):
            return candidate
    return ""
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def find_redirect_url(html: str, base_url: str) -> str:
    """Handle meta refresh and simple JavaScript redirects."""
    content_html = html or ""
    document = BeautifulSoup(content_html, "html.parser")

    # <meta http-equiv="refresh" content="0; url=...">
    refresh_tag = document.find("meta", attrs={"http-equiv": re.compile(r"refresh", re.I)})
    if refresh_tag:
        refresh_spec = refresh_tag.get("content", "")
        url_part = re.search(r"url\s*=\s*([^;]+)", refresh_spec, re.I)
        if url_part:
            return urljoin(base_url, url_part.group(1).strip().strip("\"'"))

    # Inline JS assignments of the form location.href = "...".
    js_redirect_patterns = (
        r'location\.href\s*=\s*["\']([^"\']+)["\']',
        r'window\.location\s*=\s*["\']([^"\']+)["\']',
        r'window\.location\.href\s*=\s*["\']([^"\']+)["\']',
    )
    for js_pattern in js_redirect_patterns:
        hit = re.search(js_pattern, content_html, re.I)
        if hit:
            return urljoin(base_url, hit.group(1).strip())
    return ""
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def find_job_list_url(html: str, base_url: str) -> str:
    """Find a likely jobs listing URL from page CTAs, anchors, buttons, or iframes.

    Scans anchors, buttons, iframes, and elements carrying data-href or
    formaction attributes for job-search wording, returning the first hinted
    target resolved against base_url, or "" when none is found.
    """
    soup = BeautifulSoup(html or "", "html.parser")
    for tag in soup.select("a[href], button, iframe[src], [data-href], [formaction]"):
        text = " ".join(tag.get_text(" ", strip=True).split()).lower()
        target = tag.get("href") or tag.get("src") or tag.get("data-href") or tag.get("formaction") or ""
        blob = f"{text} {target}".lower()
        if any(hint in blob for hint in JOB_CTA_HINTS):
            # Bug fix: previously a hinted element with no navigable target
            # (e.g. a bare <button> labeled "Search Jobs") returned "" and
            # aborted the whole scan; keep looking for a later element that
            # actually links somewhere.
            if target:
                return urljoin(base_url, target.strip())
    return ""
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def looks_like_shell_page(html: str) -> bool:
|
| 93 |
+
"""Detect shell/search pages that do not yet expose job content."""
|
| 94 |
+
lowered = (html or "").lower()
|
| 95 |
+
if not lowered:
|
| 96 |
+
return True
|
| 97 |
+
shell_markers = ["search jobs", "view jobs", "career search", "keyword", "join our talent community"]
|
| 98 |
+
job_markers = ["apply now", "job id", "req id", "posted", "department"]
|
| 99 |
+
return sum(marker in lowered for marker in shell_markers) >= 2 and sum(marker in lowered for marker in job_markers) <= 1
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def looks_js_heavy(html: str) -> bool:
    """Detect pages that likely require browser execution."""
    text = (html or "").lower()
    # Very small responses are almost always JS shells.
    if len(text) < 1500:
        return True
    js_markers = ("enable javascript", "loading", "__next", "app-root", "hydration")
    return any(marker in text for marker in js_markers)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def _request_page(url: str, timeout: int = 12) -> ResolvedJobsPage:
    """Fetch a page via requests and wrap the response as a ResolvedJobsPage.

    Raises requests exceptions (including HTTPError via raise_for_status) on
    failure; callers handle escalation to the browser fallback.
    """
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    landed_url = resp.url or url
    body = resp.text
    return ResolvedJobsPage(
        requested_url=url,
        url=landed_url,
        final_url=landed_url,
        html=body,
        fetch_method="requests",
        resolution_steps=[f"requests:{url} -> {landed_url}"],
        html_snapshots={"requests": body},
    )
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _playwright_resolve(url: str) -> ResolvedJobsPage:
    """Render the page in headless Chromium for JS-heavy careers sites.

    Returns an empty-page result with failure_reason="REQUEST_FAILED" when
    Playwright is unavailable or the browser run raises.
    """
    try:
        # Imported lazily so environments without Playwright still import this module.
        sync_api = importlib.import_module("playwright.sync_api")
        sync_playwright = getattr(sync_api, "sync_playwright")

        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(url, wait_until="domcontentloaded", timeout=20000)

            # Best effort: give SPA frameworks a moment to settle.
            try:
                page.wait_for_load_state("networkidle", timeout=5000)
            except Exception:
                pass

            # Click the first job-search CTA that works, then stop trying.
            cta_labels = ["View Jobs", "Search Jobs", "Open Positions", "Current Openings", "Explore Jobs", "See All Jobs"]
            for label in cta_labels:
                try:
                    page.get_by_text(label, exact=False).first.click(timeout=1500)
                    page.wait_for_timeout(800)
                    break
                except Exception:
                    continue

            # Scroll a few screens to trigger lazy-loaded job listings.
            for _ in range(3):
                page.mouse.wheel(0, 1800)
                page.wait_for_timeout(400)

            rendered_html = page.content()
            landed_url = page.url
            browser.close()
    except Exception:
        return ResolvedJobsPage(requested_url=url, url=url, final_url=url, html="", fetch_method="playwright", fallback_used=True, failure_reason="REQUEST_FAILED")

    return ResolvedJobsPage(
        requested_url=url,
        url=landed_url,
        final_url=landed_url,
        html=rendered_html,
        fetch_method="playwright",
        fallback_used=True,
        resolution_steps=[f"playwright:{url} -> {landed_url}"],
        html_snapshots={"playwright": rendered_html},
    )
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def resolve_real_jobs_page(careers_url: str) -> ResolvedJobsPage:
    """Resolve the company careers landing page to the real jobs page before collection.

    Follows up to four hops through meta/JS redirects, ATS-hosted links, and
    job-list CTAs, escalating to the Playwright fallback when a fetch fails or
    the static HTML looks like a JS shell. Every exit path reports the
    originally requested careers_url in requested_url so callers can correlate
    results with their input.
    """
    if not careers_url:
        return ResolvedJobsPage(requested_url="", url="", final_url="", html="", fetch_method="none", failure_reason="REQUEST_FAILED")

    visited = set()
    current_url = careers_url
    steps: List[str] = []
    snapshots: Dict[str, str] = {}
    last_page = ResolvedJobsPage(requested_url=careers_url, url=careers_url, final_url=careers_url, html="", fetch_method="requests")

    for step in range(4):  # bounded hop count guards against redirect loops
        if current_url in visited:
            break
        visited.add(current_url)

        try:
            last_page = _request_page(current_url)
            steps.extend(last_page.resolution_steps)
            snapshots[f"step{step + 1}_requests"] = last_page.html
        except Exception:
            # Plain HTTP fetch failed; escalate to the headless browser.
            fallback = _playwright_resolve(current_url)
            # Consistency fix: report the original entry point, not the hop
            # URL (the success path below always resets requested_url).
            fallback.requested_url = careers_url
            fallback.failure_reason = fallback.failure_reason or "REQUEST_FAILED"
            fallback.resolution_steps = steps + fallback.resolution_steps
            fallback.html_snapshots.update(snapshots)
            return fallback

        # Prefer explicit redirects, then ATS-hosted links, then job-list CTAs.
        next_url = find_redirect_url(last_page.html, last_page.url) or find_ats_link(last_page.html, last_page.url) or find_job_list_url(last_page.html, last_page.url)
        if next_url and next_url not in visited:
            steps.append(f"discovered:{last_page.url} -> {next_url}")
            current_url = next_url
            continue

        if looks_like_shell_page(last_page.html) or looks_js_heavy(last_page.html):
            # Static HTML has no job content yet; re-render with Playwright.
            fallback = _playwright_resolve(last_page.url)
            # Consistency fix: keep the originally requested URL here too.
            fallback.requested_url = careers_url
            fallback.failure_reason = fallback.failure_reason or "JS_PAGE"
            fallback.resolution_steps = steps + fallback.resolution_steps
            fallback.html_snapshots.update(snapshots)
            return fallback

        break

    last_page.requested_url = careers_url
    last_page.resolution_steps = steps
    last_page.html_snapshots.update(snapshots)
    if looks_like_shell_page(last_page.html):
        last_page.failure_reason = "SHELL_PAGE"
    return last_page
|