# src/collectors/common.py — shared helpers for job-board collectors
# (scrape-header residue preserved: repo "nsbecf", author "acarey5", commit 851ce09 "new scrapping")
from __future__ import annotations
from typing import Iterable, List
from src.models import CompanyRecord, JobPosting
# Lowercase substrings that suggest a posting targets early-career candidates.
# NOTE(review): matching elsewhere is plain substring containment, so "intern"
# also matches words like "internal"/"international" — confirm this is acceptable.
# "internship" is redundant with "intern" under substring matching but kept for clarity.
ENTRY_LEVEL_SIGNALS = [
"new grad",
"university",
"early career",
"associate",
"entry level",
"campus",
"intern",
"internship",
]
# Seniority markers; any hit vetoes an entry-level classification.
SENIOR_SIGNALS = ["senior", "staff", "principal", "manager", "director"]
# Markers of remote-friendly roles ("hybrid remote" is subsumed by "remote" under substring matching).
REMOTE_SIGNALS = ["remote", "hybrid remote", "work from home"]
def is_remote_role(*texts: str) -> bool:
    """Return True when any remote-work marker appears in the given texts.

    All fragments are joined and lowercased before a substring scan against
    REMOTE_SIGNALS, so matching is case-insensitive and spans field boundaries.
    """
    haystack = " ".join(texts).lower()
    for marker in REMOTE_SIGNALS:
        if marker in haystack:
            return True
    return False
def is_entry_level_role(*texts: str) -> bool:
    """Return True when the texts read as entry-level and show no seniority markers.

    A single SENIOR_SIGNALS hit (e.g. "senior", "director") vetoes the result;
    otherwise any ENTRY_LEVEL_SIGNALS substring qualifies. Comparison is
    case-insensitive over the concatenated texts.
    """
    haystack = " ".join(texts).lower()
    looks_senior = any(marker in haystack for marker in SENIOR_SIGNALS)
    return not looks_senior and any(
        marker in haystack for marker in ENTRY_LEVEL_SIGNALS
    )
def normalize_job_posting(
    company_record: CompanyRecord,
    *,
    title: str,
    location: str,
    job_url: str,
    description: str,
    department: str = "",
    source_ats: str = "generic",
    resolved_url: str = "",
    employment_type: str = "",
    posted_date: str = "",
    raw_payload: dict | None = None,
) -> JobPosting:
    """Normalize connector output into the shared JobPosting schema.

    Text fields are stripped of surrounding whitespace and capped at a
    per-field length so downstream storage stays bounded. Remote and
    entry-level flags are derived heuristically from the text content.

    Args:
        company_record: Company the posting belongs to; supplies the company
            name and the careers-page fallback URL.
        title / location / job_url / description: Raw fields from the connector.
        department: Optional team/department label.
        source_ats: Identifier of the ATS the posting came from.
        resolved_url: Final posting URL; falls back to the company careers URL.
        employment_type: Optional employment type (e.g. full-time).
        posted_date: Optional posting date string.
        raw_payload: Original connector payload, kept for debugging.
    """

    def _clip(text: str, limit: int) -> str:
        # Trim surrounding whitespace, then cap the length.
        return text.strip()[:limit]

    return JobPosting(
        company=company_record.company,
        title=_clip(title, 160),
        location=_clip(location, 160),
        url=job_url.strip(),
        department=_clip(department, 160),
        description=_clip(description, 4000),
        ats=source_ats,
        source_ats=source_ats,
        resolved_url=resolved_url or company_record.careers_url,
        employment_type=_clip(employment_type, 80),
        posted_date=_clip(posted_date, 80),
        is_remote=is_remote_role(title, location, description),
        is_entry_level=is_entry_level_role(title, description),
        raw_payload=raw_payload or {},
    )
def dedupe_jobs(jobs: Iterable[JobPosting]) -> List[JobPosting]:
    """Deduplicate jobs by normalized (stripped, lowercased) title and URL.

    The first occurrence of each (title, url) pair wins; input order is
    otherwise preserved.
    """
    # Plain dicts preserve insertion order, so the first posting per key survives.
    unique: dict = {}
    for posting in jobs:
        fingerprint = (posting.title.strip().lower(), posting.url.strip().lower())
        unique.setdefault(fingerprint, posting)
    return list(unique.values())