acarey5 commited on
Commit
851ce09
·
1 Parent(s): b8d2d77

new scraping

Browse files
data/cached_jobs.json ADDED
The diff for this file is too large to render. See raw diff
 
data/nsbe_companies.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ company,careers_url,ats_type,ats_identifier,priority
2
+ Stripe,https://stripe.com/jobs/search,greenhouse,stripe,10
3
+ Netflix,https://jobs.netflix.com/,lever,netflix,10
4
+ Capital One,https://www.capitalonecareers.com/,workday,,9
5
+ Chevron,https://chevron.wd5.myworkdayjobs.com/ChevronCareers,workday,,8
6
+ ServiceNow,https://careers.servicenow.com/careers,smartrecruiters,,8
7
+ ABB,https://global.abb/group/en/careers,generic,,7
src/cache.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from datetime import datetime, timezone
5
+ from pathlib import Path
6
+ from typing import Iterable, List, Sequence
7
+
8
+ from src.models import JobPosting
9
+
10
+
11
def _empty_cache_payload() -> dict:
    """Return a fresh default payload so callers can safely mutate the result."""
    return {"generated_at": "", "companies": [], "jobs": []}


def load_cached_jobs(cache_path: str | Path) -> dict:
    """Load the cached-jobs JSON payload from *cache_path*.

    Returns an empty skeleton payload (keys: "generated_at", "companies",
    "jobs") when the file is missing, unreadable, or not valid JSON, so
    callers never have to handle I/O or parse errors themselves.
    """
    path = Path(cache_path)
    if not path.exists():
        return _empty_cache_payload()
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    # Bug fix: the original caught bare Exception, which also swallowed
    # programming errors. OSError covers read failures; ValueError covers
    # json.JSONDecodeError (its subclass).
    except (OSError, ValueError):
        return _empty_cache_payload()
19
+
20
+
21
def write_cached_jobs(cache_path: str | Path, jobs: Sequence[JobPosting], companies: Iterable[str]) -> Path:
    """Serialize *jobs* and *companies* to *cache_path*, creating parent dirs.

    The payload records a UTC timestamp, the sorted unique company names,
    and one dict per job (via JobPosting.to_dict). Returns the written path.
    """
    target = Path(cache_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    body = json.dumps(
        {
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "companies": sorted(set(companies)),
            "jobs": [posting.to_dict() for posting in jobs],
        },
        indent=2,
    )
    target.write_text(body, encoding="utf-8")
    return target
31
+
32
+
33
def merge_cached_jobs(cache_path: str | Path, refreshed_jobs: Sequence[JobPosting], refreshed_companies: Iterable[str]) -> Path:
    """Merge freshly collected jobs into the existing cache file.

    Jobs belonging to the refreshed companies are replaced wholesale; jobs
    for every other company are preserved as-is. Returns the cache path.
    """
    refreshed = set(refreshed_companies)
    existing = load_cached_jobs(cache_path)

    # Drop cached entries whose company was just re-collected.
    kept = [entry for entry in existing.get("jobs", []) if entry.get("company") not in refreshed]
    fresh = [posting.to_dict() for posting in refreshed_jobs]

    payload = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        # (old - refreshed) | refreshed == old | refreshed
        "companies": sorted(set(existing.get("companies", [])) | refreshed),
        "jobs": kept + fresh,
    }

    target = Path(cache_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    return target
src/collect_jobs.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Iterable, List, Optional
8
+
9
+ from src import cache
10
+ from src.collectors import generic, greenhouse, lever, smartrecruiters, workday
11
+ from src.collectors.common import dedupe_jobs
12
+ from src.company_loader import load_company_records
13
+ from src.detectors.ats_detector import detect_ats_type, extract_ats_identifier
14
+ from src.jobs.debug_utils import save_debug_html
15
+ from src.models import CompanyRecord, JobPosting
16
+ from src.resolver.jobs_page_resolver import ResolvedJobsPage, resolve_real_jobs_page
17
+
18
+ LOGGER = logging.getLogger("career_fair_matcher.collect_jobs")
19
+
20
# Dispatch table: ATS type -> connector entry point.
# Types missing from this mapping fall back to the "generic" HTML parser.
CONNECTOR_REGISTRY = {
    "greenhouse": greenhouse.collect,
    "lever": lever.collect,
    "workday": workday.collect,
    "smartrecruiters": smartrecruiters.collect,
    "generic": generic.collect,
}
27
+
28
+
29
@dataclass
class CompanyCollectionOutcome:
    """Per-company summary of one collection attempt, emitted as a JSON log line."""

    # Company display name from the curated CSV.
    company: str
    # careers_url exactly as configured in the CSV.
    original_url: str
    # URL the resolver actually scraped (may differ after redirects/ATS hops).
    resolved_url: str
    # Fetch mechanism reported by the resolver ("requests"/"playwright").
    fetch_method: str
    # ATS provider detected (or declared) for this company.
    ats_detected: str
    # Name of the connector that actually produced the jobs.
    connector_used: str
    # Number of jobs collected for this company.
    jobs_collected: int
    # "" on success, otherwise a failure label or exception text.
    failure_reason: str = ""
39
+
40
+
41
def configure_logging(level: int = logging.INFO) -> None:
    """Initialise root logging once; a no-op when handlers already exist."""
    if not LOGGER.handlers:
        logging.basicConfig(level=level, format="%(message)s")
45
+
46
+
47
def _log_company_event(outcome: CompanyCollectionOutcome) -> None:
    """Emit one structured JSON log line summarising a company's collection run."""
    event = {
        "company": outcome.company,
        "original_careers_url": outcome.original_url,
        "resolved_url": outcome.resolved_url,
        "fetch_method": outcome.fetch_method,
        "ats_detected": outcome.ats_detected,
        "connector_used": outcome.connector_used,
        "jobs_collected": outcome.jobs_collected,
        "failure_reason": outcome.failure_reason,
    }
    LOGGER.info(json.dumps(event))
58
+
59
+
60
def _save_resolution_snapshots(debug_dir: Path, company: CompanyRecord, resolved_page: ResolvedJobsPage) -> None:
    """Persist each intermediate HTML snapshot, then the final resolved page, for debugging."""
    name = company.company
    for stage, snapshot_html in resolved_page.html_snapshots.items():
        save_debug_html(name, snapshot_html, stage, debug_dir)
    save_debug_html(name, resolved_page.html, "resolved", debug_dir)
64
+
65
+
66
def _collect_from_connector(company: CompanyRecord, resolved_page: ResolvedJobsPage, ats_type: str) -> tuple[List[JobPosting], str, str]:
    """Run the ATS-specific connector, falling back to the generic parser.

    Returns (deduped jobs, connector actually used, failure reason).
    The failure reason is "" on success, the resolver's failure when one is
    recorded, "ATS_PARSE_FAILED" when both the ATS connector and the generic
    fallback came up empty, otherwise "NO_JOBS_FOUND".
    """
    connector_name = ats_type if ats_type in CONNECTOR_REGISTRY else "generic"
    jobs = CONNECTOR_REGISTRY[connector_name](company, resolved_page)

    ats_parse_failed = False
    if not jobs and connector_name != "generic":
        # Dedicated connector produced nothing; retry with the generic parser.
        jobs = generic.collect(company, resolved_page)
        ats_parse_failed = not jobs
        connector_name = "generic"

    if not jobs:
        # Bug fix: the original computed "ATS_PARSE_FAILED" but then always
        # overwrote it with the resolver failure, so it was never reported.
        reason = resolved_page.failure_reason or ("ATS_PARSE_FAILED" if ats_parse_failed else "NO_JOBS_FOUND")
        return [], connector_name, reason
    return dedupe_jobs(jobs), connector_name, ""
81
+
82
+
83
def collect_jobs_for_company(company: CompanyRecord, debug_dir: Path) -> tuple[List[JobPosting], CompanyCollectionOutcome]:
    """Resolve, detect, collect, normalize, and log a single curated company.

    Returns the collected jobs plus a CompanyCollectionOutcome describing the
    attempt (also emitted to the structured log).
    """
    resolved_page = resolve_real_jobs_page(company.careers_url)
    _save_resolution_snapshots(debug_dir, company, resolved_page)

    ats_type = detect_ats_type(resolved_page.url, resolved_page.html, company.ats_type)
    if not company.ats_identifier:
        # Backfill the board identifier so API-based connectors can be used.
        company.ats_identifier = extract_ats_identifier(ats_type, resolved_page.url, resolved_page.html)

    jobs, connector_used, failure_reason = _collect_from_connector(company, resolved_page, ats_type)
    # Bug fix: removed a no-op "job if isinstance(job, JobPosting) else job"
    # comprehension — both branches returned the same object, so it only
    # copied the list.
    outcome = CompanyCollectionOutcome(
        company=company.company,
        original_url=company.careers_url,
        resolved_url=resolved_page.url,
        fetch_method=resolved_page.fetch_method,
        ats_detected=ats_type,
        connector_used=connector_used,
        jobs_collected=len(jobs),
        failure_reason=failure_reason,
    )
    _log_company_event(outcome)
    return jobs, outcome
109
+
110
+
111
def refresh_selected_companies(
    *,
    company_csv: str | Path,
    cache_path: str | Path,
    debug_dir: str | Path,
    selected_companies: Optional[Iterable[str]] = None,
    limit: Optional[int] = None,
) -> tuple[List[JobPosting], List[CompanyCollectionOutcome], Path]:
    """Refresh cached jobs for the curated list, preserving partial results on failure.

    A failing company is logged with its exception text as the failure reason
    and does not abort the run. Returns (deduped jobs, per-company outcomes,
    cache file path).
    """
    configure_logging()
    debug_path = Path(debug_dir)
    companies = load_company_records(company_csv, limit=limit, selected_companies=set(selected_companies or []))

    all_jobs: List[JobPosting] = []
    outcomes: List[CompanyCollectionOutcome] = []

    for company in companies:
        try:
            company_jobs, outcome = collect_jobs_for_company(company, debug_path)
            all_jobs.extend(company_jobs)
        except Exception as exc:
            # Best effort: record the failure and move on to the next company.
            outcome = CompanyCollectionOutcome(
                company=company.company,
                original_url=company.careers_url,
                resolved_url=company.careers_url,
                fetch_method="requests",
                ats_detected=company.ats_type or "generic",
                connector_used="generic",
                jobs_collected=0,
                failure_reason=str(exc) or "REQUEST_FAILED",
            )
            _log_company_event(outcome)
        outcomes.append(outcome)

    # Perf fix: dedupe once and reuse — the original deduped the same list twice.
    deduped_jobs = dedupe_jobs(all_jobs)
    refreshed_companies = [company.company for company in companies]
    cache_file = cache.merge_cached_jobs(cache_path, deduped_jobs, refreshed_companies)
    return deduped_jobs, outcomes, cache_file
149
+
150
+
151
def main() -> None:
    """CLI entry point: refresh every curated company using repo-relative paths."""
    repo_root = Path(__file__).resolve().parent.parent
    refresh_selected_companies(
        company_csv=repo_root / "data" / "nsbe_companies.csv",
        cache_path=repo_root / "data" / "cached_jobs.json",
        debug_dir=repo_root / "debug_html" / "collect_jobs",
    )


if __name__ == "__main__":
    main()
src/collectors/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """ATS-specific and generic job collectors."""
src/collectors/common.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, List
4
+
5
+ from src.models import CompanyRecord, JobPosting
6
+
7
ENTRY_LEVEL_SIGNALS = [
    "new grad",
    "university",
    "early career",
    "associate",
    "entry level",
    "campus",
    "intern",
    "internship",
]

SENIOR_SIGNALS = ["senior", "staff", "principal", "manager", "director"]

REMOTE_SIGNALS = ["remote", "hybrid remote", "work from home"]


def _normalized_blob(texts: tuple[str, ...]) -> str:
    """Lowercase *texts*, replace punctuation with spaces, collapse whitespace."""
    lowered = " ".join(texts).lower()
    cleaned = "".join(ch if ch.isalnum() else " " for ch in lowered)
    return " ".join(cleaned.split())


def _contains_word_signal(blob: str, signals: list[str]) -> bool:
    """True when any signal appears as whole word(s) in the normalized *blob*."""
    padded = f" {blob} "
    return any(f" {signal} " in padded for signal in signals)


def is_remote_role(*texts: str) -> bool:
    """Heuristic: does any text fragment mention a remote-work arrangement?"""
    blob = " ".join(texts).lower()
    return any(signal in blob for signal in REMOTE_SIGNALS)


def is_entry_level_role(*texts: str) -> bool:
    """Heuristic entry-level classifier; senior signals veto entry signals.

    Bug fix: signals are matched on word boundaries now, so "intern" no
    longer fires on "internal"/"international" and "staff" no longer fires
    on "staffing". Punctuation is treated as whitespace, so hyphenated
    forms like "entry-level" also match.
    """
    blob = _normalized_blob(texts)
    if _contains_word_signal(blob, SENIOR_SIGNALS):
        return False
    return _contains_word_signal(blob, ENTRY_LEVEL_SIGNALS)
33
+
34
+
35
def normalize_job_posting(
    company_record: CompanyRecord,
    *,
    title: str,
    location: str,
    job_url: str,
    description: str,
    department: str = "",
    source_ats: str = "generic",
    resolved_url: str = "",
    employment_type: str = "",
    posted_date: str = "",
    raw_payload: dict | None = None,
) -> JobPosting:
    """Normalize connector output into the shared JobPosting schema.

    Free-text fields are stripped and length-capped so one bloated posting
    cannot blow up the cache; remote/entry-level flags are derived from the
    UNTRUNCATED text.
    """
    clean_title = title.strip()[:160]
    clean_location = location.strip()[:160]
    clean_description = description.strip()[:4000]
    return JobPosting(
        company=company_record.company,
        title=clean_title,
        location=clean_location,
        url=job_url.strip(),
        department=department.strip()[:160],
        description=clean_description,
        ats=source_ats,
        source_ats=source_ats,
        resolved_url=resolved_url or company_record.careers_url,
        employment_type=employment_type.strip()[:80],
        posted_date=posted_date.strip()[:80],
        is_remote=is_remote_role(title, location, description),
        is_entry_level=is_entry_level_role(title, description),
        raw_payload=raw_payload or {},
    )
66
+
67
+
68
def dedupe_jobs(jobs: Iterable[JobPosting]) -> List[JobPosting]:
    """Deduplicate jobs by normalized (title, url), keeping first-seen order."""
    unique: List[JobPosting] = []
    seen_keys: set = set()
    for posting in jobs:
        fingerprint = (posting.title.strip().lower(), posting.url.strip().lower())
        if fingerprint not in seen_keys:
            seen_keys.add(fingerprint)
            unique.append(posting)
    return unique
src/collectors/generic.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import List
5
+
6
+ from src.collectors.common import dedupe_jobs, normalize_job_posting
7
+ from src.jobs.extractor import ExtractionDiagnostics, extract_jobs_with_diagnostics
8
+ from src.models import CompanyRecord, JobPosting
9
+ from src.resolver.jobs_page_resolver import ResolvedJobsPage
10
+
11
+
12
@dataclass
class GenericCollectionDetails:
    """Result bundle from the generic collector: jobs plus extraction diagnostics."""

    # Normalized, deduplicated postings (empty when extraction found nothing).
    jobs: List[JobPosting] = field(default_factory=list)
    # Failure label copied from diagnostics.failure_type; "UNKNOWN" until populated.
    failure_reason: str = "UNKNOWN"
    # Raw diagnostics from extract_jobs_with_diagnostics, when available.
    diagnostics: ExtractionDiagnostics | None = None
17
+
18
+
19
def collect(company_record: CompanyRecord, resolved_page: ResolvedJobsPage) -> List[JobPosting]:
    """Generic fallback that parses resolved HTML and embedded JSON job data."""
    details = collect_with_details(company_record, resolved_page)
    return details.jobs
22
+
23
+
24
def collect_with_details(
    company_record: CompanyRecord,
    resolved_page: ResolvedJobsPage,
    source_ats: str = "generic",
) -> GenericCollectionDetails:
    """Extract jobs from the resolved page HTML and report diagnostics alongside.

    *source_ats* labels the postings (e.g. "workday" when used as that
    connector's backend). Returns jobs deduped by (title, url) plus the
    extractor's failure type and raw diagnostics.
    """
    parsed_jobs, diagnostics = extract_jobs_with_diagnostics(
        company_record,
        resolved_page.html,
        source_ats,
        base_url=resolved_page.url,
    )
    # Re-shape raw extractor hits into the shared JobPosting schema.
    normalized = [
        normalize_job_posting(
            company_record,
            title=job.title,
            location=job.location,
            job_url=job.url,
            description=job.description,
            department=job.department,
            source_ats=source_ats,
            resolved_url=resolved_page.url,
            raw_payload=job.raw_payload,
        )
        for job in parsed_jobs
    ]
    return GenericCollectionDetails(
        jobs=dedupe_jobs(normalized),
        failure_reason=diagnostics.failure_type,
        diagnostics=diagnostics,
    )
src/collectors/greenhouse.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List
4
+
5
+ import requests
6
+
7
+ from src.collectors.common import dedupe_jobs, normalize_job_posting
8
+ from src.detectors.ats_detector import extract_ats_identifier
9
+ from src.models import CompanyRecord, JobPosting
10
+ from src.resolver.jobs_page_resolver import ResolvedJobsPage
11
+
12
+ HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
13
+
14
+
15
def collect(company_record: CompanyRecord, resolved_page: ResolvedJobsPage) -> List[JobPosting]:
    """Collect Greenhouse jobs via the public boards API."""
    board = company_record.ats_identifier or extract_ats_identifier("greenhouse", resolved_page.url, resolved_page.html)
    if not board:
        return []

    endpoint = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs?content=true"
    try:
        reply = requests.get(endpoint, headers=HEADERS, timeout=20)
        reply.raise_for_status()
        data = reply.json()
    except Exception:
        # Best effort: any fetch/parse problem simply yields no jobs.
        return []

    collected: List[JobPosting] = []
    for entry in data.get("jobs", []):
        if not entry.get("title"):
            continue
        collected.append(
            normalize_job_posting(
                company_record,
                title=entry.get("title", ""),
                location=((entry.get("location") or {}).get("name") or ""),
                job_url=entry.get("absolute_url", resolved_page.url),
                description=(entry.get("content") or ""),
                department=(entry.get("department") or ""),
                source_ats="greenhouse",
                resolved_url=resolved_page.url,
                posted_date=str(entry.get("updated_at") or ""),
                raw_payload=entry,
            )
        )
    return dedupe_jobs(collected)
src/collectors/lever.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List
4
+
5
+ import requests
6
+
7
+ from src.collectors.common import dedupe_jobs, normalize_job_posting
8
+ from src.detectors.ats_detector import extract_ats_identifier
9
+ from src.models import CompanyRecord, JobPosting
10
+ from src.resolver.jobs_page_resolver import ResolvedJobsPage
11
+
12
+ HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
13
+
14
+
15
def collect(company_record: CompanyRecord, resolved_page: ResolvedJobsPage) -> List[JobPosting]:
    """Collect Lever jobs via the public postings endpoint."""
    slug = company_record.ats_identifier or extract_ats_identifier("lever", resolved_page.url, resolved_page.html)
    if not slug:
        return []

    endpoint = f"https://api.lever.co/v0/postings/{slug}?mode=json"
    try:
        reply = requests.get(endpoint, headers=HEADERS, timeout=20)
        reply.raise_for_status()
        postings = reply.json()
    except Exception:
        # Best effort: any fetch/parse problem simply yields no jobs.
        return []

    normalized = [
        normalize_job_posting(
            company_record,
            title=posting.get("text", ""),
            location=(posting.get("categories") or {}).get("location", ""),
            job_url=posting.get("hostedUrl", resolved_page.url),
            description=posting.get("descriptionPlain", ""),
            department=(posting.get("categories") or {}).get("team", ""),
            source_ats="lever",
            resolved_url=resolved_page.url,
            employment_type=(posting.get("categories") or {}).get("commitment", ""),
            raw_payload=posting,
        )
        for posting in postings
    ]
    return dedupe_jobs(normalized)
src/collectors/smartrecruiters.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List
4
+
5
+ import requests
6
+
7
+ from src.collectors.common import dedupe_jobs, normalize_job_posting
8
+ from src.detectors.ats_detector import extract_ats_identifier
9
+ from src.models import CompanyRecord, JobPosting
10
+ from src.resolver.jobs_page_resolver import ResolvedJobsPage
11
+
12
+ HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
13
+
14
+
15
+ def collect(company_record: CompanyRecord, resolved_page: ResolvedJobsPage) -> List[JobPosting]:
16
+ """Collect SmartRecruiters jobs when a company identifier is available."""
17
+ identifier = company_record.ats_identifier or extract_ats_identifier("smartrecruiters", resolved_page.url, resolved_page.html)
18
+ if not identifier:
19
+ return []
20
+
21
+ api_url = f"https://api.smartrecruiters.com/v1/companies/{identifier}/postings"
22
+ try:
23
+ response = requests.get(api_url, headers=HEADERS, timeout=20)
24
+ response.raise_for_status()
25
+ payload = response.json()
26
+ except Exception:
27
+ return []
28
+
29
+ jobs = []
30
+ for item in payload.get("content", []):
31
+ jobs.append(
32
+ normalize_job_posting(
33
+ company_record,
34
+ title=item.get("name", ""),
35
+ location=(item.get("location") or {}).get("city", ""),
36
+ job_url=item.get("ref", resolved_page.url),
37
+ description=item.get("jobAd", {}).get("sections", "") if isinstance(item.get("jobAd"), dict) else "",
38
+ department=item.get("department", ""),
39
+ source_ats="smartrecruiters",
40
+ resolved_url=resolved_page.url,
41
+ employment_type=item.get("typeOfEmployment", ""),
42
+ posted_date=str(item.get("releasedDate") or ""),
43
+ raw_payload=item,
44
+ )
45
+ )
46
+ return dedupe_jobs(jobs)
src/collectors/workday.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List
4
+
5
+ from src.collectors.common import dedupe_jobs
6
+ from src.collectors.generic import collect_with_details
7
+ from src.models import CompanyRecord, JobPosting
8
+ from src.resolver.jobs_page_resolver import ResolvedJobsPage
9
+
10
+
11
def collect(company_record: CompanyRecord, resolved_page: ResolvedJobsPage) -> List[JobPosting]:
    """Best-effort Workday collection.

    Workday endpoints vary widely by tenant, so this connector currently relies on the
    resolved HTML plus embedded JSON extraction until a tenant-specific endpoint is configured.
    """
    outcome = collect_with_details(company_record, resolved_page, source_ats="workday")
    return dedupe_jobs(outcome.jobs)
src/company_loader.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ from pathlib import Path
5
+ from typing import Iterable, List, Optional, Set
6
+
7
+ from src.models import CompanyRecord
8
+
9
+ COMPANY_KEYS = ["company", "company list", "name"]
10
+ CAREERS_KEYS = ["careers_url", "career url", "jobs_url", "direct links to company career/job openings page"]
11
+
12
+
13
+ def _pick_value(row: dict[str, str], keys: Iterable[str]) -> str:
14
+ for key in keys:
15
+ value = row.get(key, "").strip()
16
+ if value:
17
+ return value
18
+ return ""
19
+
20
+
21
+ def _parse_priority(value: str) -> int:
22
+ try:
23
+ return int(value.strip()) if value.strip() else 0
24
+ except Exception:
25
+ return 0
26
+
27
+
28
def load_company_records(
    csv_path: str | Path,
    *,
    limit: Optional[int] = None,
    selected_companies: Optional[Set[str]] = None,
) -> List[CompanyRecord]:
    """Load the curated company list used by the targeted job collector.

    Rows missing a company name or careers URL are skipped. When
    *selected_companies* is a non-empty set, only those names are kept.
    *limit*, when truthy, caps the number of records returned.

    Raises FileNotFoundError when the CSV does not exist.
    """
    source = Path(csv_path)
    if not source.exists():
        raise FileNotFoundError(f"Company CSV not found: {source}")

    records: List[CompanyRecord] = []
    with source.open("r", encoding="utf-8-sig", newline="") as handle:
        for raw_row in csv.DictReader(handle):
            # Normalize: lowercase stripped keys, stringified stripped values,
            # dropping any None header column.
            row = {str(k).strip().lower(): str(v or "").strip() for k, v in raw_row.items() if k}
            name = _pick_value(row, COMPANY_KEYS)
            url = _pick_value(row, CAREERS_KEYS)
            if not name or not url:
                continue
            if selected_companies and name not in selected_companies:
                continue

            records.append(
                CompanyRecord(
                    company=name,
                    careers_url=url,
                    ats_type=row.get("ats_type", ""),
                    ats_identifier=row.get("ats_identifier", ""),
                    priority=_parse_priority(row.get("priority", "0")),
                    source="curated",
                    meta=row,
                )
            )

    return records[:limit] if limit else records
src/detectors/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """ATS detection helpers."""
src/detectors/ats_detector.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from urllib.parse import parse_qs, urlparse
5
+
6
# Supported ATS providers mapped to hostname fragments that identify them.
KNOWN_ATS = {
    "greenhouse": ["greenhouse.io"],
    "lever": ["lever.co", "jobs.lever.co"],
    "workday": ["myworkdayjobs.com", "workday.com"],
    "smartrecruiters": ["smartrecruiters.com"],
    "icims": ["icims.com"],
    "ashby": ["ashbyhq.com", "jobs.ashbyhq.com"],
    "successfactors": ["successfactors.com", "career8.successfactors.com"],
}


def normalize_ats_type(value: str) -> str:
    """Map a free-form ATS name onto a supported key, defaulting to "generic"."""
    candidate = (value or "").strip().lower()
    return candidate if candidate in KNOWN_ATS else "generic"


def detect_ats_type(url: str, html: str = "", declared_ats: str = "") -> str:
    """Detect the ATS provider from explicit config, resolved URL, or HTML content."""
    if declared_ats:
        # Explicit configuration always wins over sniffing.
        return normalize_ats_type(declared_ats)

    haystack = f"{url} {html}".lower()
    matches = (
        name
        for name, needles in KNOWN_ATS.items()
        if any(needle in haystack for needle in needles)
    )
    return next(matches, "generic")
35
+
36
+
37
+ def extract_ats_identifier(ats_type: str, url: str, html: str = "") -> str:
38
+ """Best-effort extraction of ATS board identifiers for connector API usage."""
39
+ patterns = {
40
+ "greenhouse": r"greenhouse\.io/([^/?#]+)",
41
+ "lever": r"lever\.co/([^/?#]+)",
42
+ "smartrecruiters": r"smartrecruiters\.com/([^/?#]+)",
43
+ "ashby": r"ashbyhq\.com/([^/?#]+)",
44
+ }
45
+ if ats_type in patterns:
46
+ match = re.search(patterns[ats_type], f"{url} {html}")
47
+ return match.group(1) if match else ""
48
+
49
+ if ats_type == "successfactors":
50
+ parsed = urlparse(url)
51
+ query = parse_qs(parsed.query)
52
+ values = query.get("company") or query.get("_s.crb") or []
53
+ return values[0] if values else ""
54
+
55
+ if ats_type == "workday":
56
+ parsed = urlparse(url)
57
+ parts = [part for part in parsed.path.split("/") if part]
58
+ return "/".join(parts[:3]) if parts else ""
59
+
60
+ return ""
src/models.py CHANGED
@@ -1,5 +1,5 @@
1
- from dataclasses import dataclass, field
2
- from typing import Dict, List
3
 
4
 
5
  @dataclass
@@ -19,6 +19,9 @@ class CompanyRecord:
19
  careers_url: str = ""
20
  source: str = "default"
21
  meta: Dict[str, str] = field(default_factory=dict)
 
 
 
22
 
23
 
24
  @dataclass
@@ -30,6 +33,20 @@ class JobPosting:
30
  department: str = ""
31
  description: str = ""
32
  ats: str = "unknown"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  @dataclass
 
1
+ from dataclasses import asdict, dataclass, field
2
+ from typing import Any, Dict, List
3
 
4
 
5
  @dataclass
 
19
  careers_url: str = ""
20
  source: str = "default"
21
  meta: Dict[str, str] = field(default_factory=dict)
22
+ ats_type: str = ""
23
+ ats_identifier: str = ""
24
+ priority: int = 0
25
 
26
 
27
  @dataclass
 
33
  department: str = ""
34
  description: str = ""
35
  ats: str = "unknown"
36
+ source_ats: str = "unknown"
37
+ resolved_url: str = ""
38
+ employment_type: str = ""
39
+ posted_date: str = ""
40
+ is_remote: bool = False
41
+ is_entry_level: bool = False
42
+ failure_reason: str = ""
43
+ raw_payload: Dict[str, Any] = field(default_factory=dict)
44
+
45
+ def to_dict(self) -> Dict[str, Any]:
46
+ payload = asdict(self)
47
+ payload["job_url"] = payload.pop("url")
48
+ payload["source_ats"] = payload.get("source_ats") or payload.get("ats", "unknown")
49
+ return payload
50
 
51
 
52
  @dataclass
src/resolver/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Jobs page resolution utilities."""
src/resolver/jobs_page_resolver.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import re
5
+ from dataclasses import dataclass, field
6
+ from typing import Dict, List
7
+ from urllib.parse import urljoin
8
+
9
+ import requests
10
+ from bs4 import BeautifulSoup
11
+
12
# Desktop browser User-Agent; some career sites block the default requests UA.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
# Hostname fragments identifying ATS-hosted boards (used to spot outbound ATS links).
ATS_LINK_PATTERNS = [
    "greenhouse.io",
    "jobs.lever.co",
    "myworkdayjobs.com",
    "smartrecruiters.com",
    "icims.com",
    "ashbyhq.com",
    "successfactors.com",
]
# Link/button text or URL fragments that typically lead to a job-listing page.
JOB_CTA_HINTS = [
    "jobs",
    "job search",
    "search jobs",
    "view all jobs",
    "open positions",
    "current openings",
    "careers search",
    "all openings",
    "explore jobs",
]


@dataclass
class ResolvedJobsPage:
    """Result of resolving a careers landing URL to an actual jobs page."""

    # URL originally requested by the caller.
    requested_url: str
    # URL whose HTML is stored in `html` (after redirects/discovery hops).
    url: str
    # Raw HTML of the resolved page; "" when every fetch failed.
    html: str
    # "requests", "playwright", or "none" (see resolve_real_jobs_page).
    fetch_method: str
    # Final URL reported by the HTTP client/browser; mirrors `url` when set.
    final_url: str = ""
    # True when the Playwright fallback produced this page.
    fallback_used: bool = False
    # "", or a label such as "REQUEST_FAILED" / "JS_PAGE" / "SHELL_PAGE".
    failure_reason: str = ""
    # Human-readable trace of each resolution hop.
    resolution_steps: List[str] = field(default_factory=list)
    # Stage name -> HTML captured at that stage (for debug dumps).
    html_snapshots: Dict[str, str] = field(default_factory=dict)
46
+
47
+
48
def find_ats_link(html: str, base_url: str) -> str:
    """Return the first ATS-hosted link or iframe source found in the page."""
    document = BeautifulSoup(html or "", "html.parser")
    for element in document.select("a[href], iframe[src]"):
        raw_target = element.get("href") or element.get("src") or ""
        absolute = urljoin(base_url, raw_target.strip())
        lowered = absolute.lower()
        if any(marker in lowered for marker in ATS_LINK_PATTERNS):
            return absolute
    return ""
57
+
58
+
59
def find_redirect_url(html: str, base_url: str) -> str:
    """Handle meta refresh and simple JavaScript redirects."""
    content_html = html or ""
    soup = BeautifulSoup(content_html, "html.parser")

    # <meta http-equiv="refresh" content="0; url=..."> style redirects.
    refresh_tag = soup.find("meta", attrs={"http-equiv": re.compile(r"refresh", re.I)})
    if refresh_tag:
        url_match = re.search(r"url\s*=\s*([^;]+)", refresh_tag.get("content", ""), re.I)
        if url_match:
            return urljoin(base_url, url_match.group(1).strip().strip("\"'"))

    # Inline-script assignments to window.location / location.href.
    js_patterns = (
        r'location\.href\s*=\s*["\']([^"\']+)["\']',
        r'window\.location\s*=\s*["\']([^"\']+)["\']',
        r'window\.location\.href\s*=\s*["\']([^"\']+)["\']',
    )
    for js_pattern in js_patterns:
        js_match = re.search(js_pattern, content_html, re.I)
        if js_match:
            return urljoin(base_url, js_match.group(1).strip())
    return ""
78
+
79
+
80
def find_job_list_url(html: str, base_url: str) -> str:
    """Find a likely jobs listing URL from page CTAs, anchors, buttons, or iframes.

    Bug fix: the original returned "" as soon as ANY matching element lacked
    a navigable target (e.g. a <button> with CTA text but no href), hiding
    real job links appearing later in the document. Target-less matches are
    now skipped and scanning continues.
    """
    soup = BeautifulSoup(html or "", "html.parser")
    for tag in soup.select("a[href], button, iframe[src], [data-href], [formaction]"):
        text = " ".join(tag.get_text(" ", strip=True).split()).lower()
        target = tag.get("href") or tag.get("src") or tag.get("data-href") or tag.get("formaction") or ""
        blob = f"{text} {target}".lower()
        if not any(hint in blob for hint in JOB_CTA_HINTS):
            continue
        if target:
            return urljoin(base_url, target.strip())
        # CTA matched but element is not navigable (JS-only button): keep looking.
    return ""
90
+
91
+
92
def looks_like_shell_page(html: str) -> bool:
    """Detect shell/search pages that do not yet expose job content."""
    text = (html or "").lower()
    if not text:
        return True
    shell_markers = ["search jobs", "view jobs", "career search", "keyword", "join our talent community"]
    job_markers = ["apply now", "job id", "req id", "posted", "department"]
    shell_hits = sum(1 for marker in shell_markers if marker in text)
    job_hits = sum(1 for marker in job_markers if marker in text)
    # Several search CTAs but (almost) no actual job content => likely a shell.
    return shell_hits >= 2 and job_hits <= 1
100
+
101
+
102
def looks_js_heavy(html: str) -> bool:
    """Detect pages that likely require browser execution."""
    text = (html or "").lower()
    # Tiny responses are almost always JS shells or error stubs.
    if len(text) < 1500:
        return True
    js_markers = ("enable javascript", "loading", "__next", "app-root", "hydration")
    return any(marker in text for marker in js_markers)
108
+
109
+
110
def _request_page(url: str, timeout: int = 12) -> ResolvedJobsPage:
    """Fetch *url* with requests and package the response as a ResolvedJobsPage.

    Raises on HTTP errors (via raise_for_status) so the caller can decide
    whether to fall back to a browser-based fetch.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    landed_url = response.url or url
    body = response.text
    return ResolvedJobsPage(
        requested_url=url,
        url=landed_url,
        final_url=landed_url,
        html=body,
        fetch_method="requests",
        resolution_steps=[f"requests:{url} -> {landed_url}"],
        html_snapshots={"requests": body},
    )
123
+
124
+
125
def _playwright_resolve(url: str) -> ResolvedJobsPage:
    """Render *url* in headless Chromium as a fallback for JS-heavy pages.

    Playwright is imported lazily so the dependency stays optional; any
    failure (missing package, launch error, navigation timeout) yields an
    empty page marked REQUEST_FAILED instead of raising.
    """
    try:
        # Lazy import: only needed when the requests path was insufficient.
        sync_api = importlib.import_module("playwright.sync_api")
        sync_playwright = getattr(sync_api, "sync_playwright")

        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(url, wait_until="domcontentloaded", timeout=20000)

            try:
                # Give SPAs a chance to settle, but don't block indefinitely.
                page.wait_for_load_state("networkidle", timeout=5000)
            except Exception:
                pass

            # Click the first job-listing call-to-action found, if any.
            for cta in ["View Jobs", "Search Jobs", "Open Positions", "Current Openings", "Explore Jobs", "See All Jobs"]:
                try:
                    page.get_by_text(cta, exact=False).first.click(timeout=1500)
                    page.wait_for_timeout(800)
                    break
                except Exception:
                    continue

            # Scroll to trigger lazy-loaded / infinite-scroll job lists.
            for _ in range(3):
                page.mouse.wheel(0, 1800)
                page.wait_for_timeout(400)

            html = page.content()
            final_url = page.url
            browser.close()
    except Exception:
        return ResolvedJobsPage(requested_url=url, url=url, final_url=url, html="", fetch_method="playwright", fallback_used=True, failure_reason="REQUEST_FAILED")

    return ResolvedJobsPage(
        requested_url=url,
        url=final_url,
        final_url=final_url,
        html=html,
        fetch_method="playwright",
        fallback_used=True,
        resolution_steps=[f"playwright:{url} -> {final_url}"],
        html_snapshots={"playwright": html},
    )
168
+
169
+
170
def resolve_real_jobs_page(careers_url: str) -> ResolvedJobsPage:
    """Resolve the company careers landing page to the real jobs page before collection.

    Follows up to four discovery hops (meta/JS redirects, ATS-hosted links,
    job CTAs), falling back to Playwright when plain requests fail or the
    page looks like a JS shell. Never raises; failures are encoded in
    the returned page's `failure_reason`.
    """
    if not careers_url:
        return ResolvedJobsPage(requested_url="", url="", final_url="", html="", fetch_method="none", failure_reason="REQUEST_FAILED")

    visited = set()
    current_url = careers_url
    steps: List[str] = []
    snapshots: Dict[str, str] = {}
    last_page = ResolvedJobsPage(requested_url=careers_url, url=careers_url, final_url=careers_url, html="", fetch_method="requests")

    for step in range(4):
        # Loop guard: never fetch the same URL twice.
        if current_url in visited:
            break
        visited.add(current_url)

        try:
            last_page = _request_page(current_url)
            steps.extend(last_page.resolution_steps)
            snapshots[f"step{step + 1}_requests"] = last_page.html
        except Exception:
            # Plain HTTP failed: hand resolution over to the browser fallback,
            # carrying along the trace and snapshots collected so far.
            fallback = _playwright_resolve(current_url)
            fallback.failure_reason = fallback.failure_reason or "REQUEST_FAILED"
            fallback.resolution_steps = steps + fallback.resolution_steps
            fallback.html_snapshots.update(snapshots)
            return fallback

        # Prefer explicit redirects, then ATS-hosted links, then generic job CTAs.
        next_url = find_redirect_url(last_page.html, last_page.url) or find_ats_link(last_page.html, last_page.url) or find_job_list_url(last_page.html, last_page.url)
        if next_url and next_url not in visited:
            steps.append(f"discovered:{last_page.url} -> {next_url}")
            current_url = next_url
            continue

        # No further hop found; if the page still looks empty or JS-driven,
        # render it in a real browser instead.
        if looks_like_shell_page(last_page.html) or looks_js_heavy(last_page.html):
            fallback = _playwright_resolve(last_page.url)
            fallback.failure_reason = fallback.failure_reason or "JS_PAGE"
            fallback.resolution_steps = steps + fallback.resolution_steps
            fallback.html_snapshots.update(snapshots)
            return fallback

        break

    last_page.requested_url = careers_url
    last_page.resolution_steps = steps
    last_page.html_snapshots.update(snapshots)
    # Flag shell pages so callers can report a meaningful failure.
    if looks_like_shell_page(last_page.html):
        last_page.failure_reason = "SHELL_PAGE"
    return last_page