Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from typing import Iterable, List | |
| from src.models import CompanyRecord, JobPosting | |
# Signal phrases are written in lower case because the classifier functions
# lower-case the text they scan before looking for them.

# Phrases that is_entry_level_role looks for to flag a posting as entry level.
ENTRY_LEVEL_SIGNALS = [
    "new grad", "university", "early career", "associate",
    "entry level", "campus", "intern", "internship",
]

# Phrases that make is_entry_level_role return False, even when an
# entry-level phrase is also present.
SENIOR_SIGNALS = [
    "senior",
    "staff",
    "principal",
    "manager",
    "director",
]

# Phrases that is_remote_role looks for to flag a posting as remote.
REMOTE_SIGNALS = [
    "remote",
    "hybrid remote",
    "work from home",
]
def is_remote_role(*texts: str) -> bool:
    """Report whether any remote-work phrase appears in the given texts.

    The texts are joined with single spaces and lower-cased, then checked
    for each entry of ``REMOTE_SIGNALS`` as a plain substring.

    Args:
        *texts: Free-form strings describing a posting (e.g. title,
            location, description).

    Returns:
        True if at least one remote signal occurs in the combined text,
        otherwise False.
    """
    haystack = " ".join(texts).lower()
    for marker in REMOTE_SIGNALS:
        if marker in haystack:
            return True
    return False
def is_entry_level_role(*texts: str) -> bool:
    """Report whether the given texts describe an entry-level role.

    The texts are joined with single spaces and lower-cased. If any entry
    of ``SENIOR_SIGNALS`` is found the result is False; otherwise the
    result is True exactly when an entry of ``ENTRY_LEVEL_SIGNALS`` is
    found.

    Signals are matched as whole words rather than raw substrings, so
    "intern" does not fire on "internal" or "international" and "staff"
    does not fire on "staffing". A trailing plural ("interns",
    "campuses") is still accepted, and the words of a multi-word signal
    may be separated by whitespace or hyphens, so "entry-level" matches
    the "entry level" signal.

    Args:
        *texts: Free-form strings describing a posting (e.g. title,
            description).

    Returns:
        True for an entry-level posting, False otherwise.
    """
    import re  # function-scope import: ``re`` is not imported at file level

    blob = " ".join(texts).lower()

    def mentions(signals) -> bool:
        # One alternative per non-blank signal; the words of a signal may
        # be joined by any run of whitespace and/or hyphens in the text.
        alternatives = [
            r"[\s\-]+".join(re.escape(word) for word in signal.split())
            for signal in signals
            if signal.strip()
        ]
        if not alternatives:
            # An empty alternation would match everywhere; treat as no hit.
            return False
        pattern = r"\b(?:" + "|".join(alternatives) + r")(?:e?s)?\b"
        return re.search(pattern, blob) is not None

    # Seniority wins: a senior marker disqualifies the posting outright.
    if mentions(SENIOR_SIGNALS):
        return False
    return mentions(ENTRY_LEVEL_SIGNALS)
def normalize_job_posting(
    company_record: CompanyRecord,
    *,
    title: str,
    location: str,
    job_url: str,
    description: str,
    department: str = "",
    source_ats: str = "generic",
    resolved_url: str = "",
    employment_type: str = "",
    posted_date: str = "",
    raw_payload: dict | None = None,
) -> JobPosting:
    """Build a ``JobPosting`` from the raw fields a connector produced.

    Text fields are stripped of surrounding whitespace and truncated:
    title, location and department to 160 characters, description to
    4000, employment type and posted date to 80. The URL is stripped but
    not truncated.

    Args:
        company_record: Source of the ``company`` name and of the
            ``careers_url`` used when ``resolved_url`` is empty.
        title: Job title.
        location: Job location text.
        job_url: Link to the posting.
        description: Posting body text.
        department: Optional department name.
        source_ats: ATS identifier; stored in both the ``ats`` and
            ``source_ats`` fields.
        resolved_url: Optional resolved URL; when falsy,
            ``company_record.careers_url`` is used instead.
        employment_type: Optional employment type text.
        posted_date: Optional posted-date text.
        raw_payload: Optional original payload; a falsy value is replaced
            by a new empty dict.

    Returns:
        The populated ``JobPosting``. ``is_remote`` is derived from the
        unmodified title, location and description; ``is_entry_level``
        from the unmodified title and description.
    """

    def clip(text: str, limit: int) -> str:
        # Trim surrounding whitespace first, then cap the length.
        return text.strip()[:limit]

    # Classification runs on the raw inputs, not the clipped values.
    remote_flag = is_remote_role(title, location, description)
    entry_level_flag = is_entry_level_role(title, description)

    if resolved_url:
        final_url = resolved_url
    else:
        final_url = company_record.careers_url

    if raw_payload:
        payload = raw_payload
    else:
        payload = {}

    return JobPosting(
        company=company_record.company,
        title=clip(title, 160),
        location=clip(location, 160),
        url=job_url.strip(),
        department=clip(department, 160),
        description=clip(description, 4000),
        ats=source_ats,
        source_ats=source_ats,
        resolved_url=final_url,
        employment_type=clip(employment_type, 80),
        posted_date=clip(posted_date, 80),
        is_remote=remote_flag,
        is_entry_level=entry_level_flag,
        raw_payload=payload,
    )
def dedupe_jobs(jobs: Iterable[JobPosting]) -> List[JobPosting]:
    """Drop repeated postings, keeping the first occurrence of each.

    Two postings count as the same when both their titles and their URLs
    are equal after stripping surrounding whitespace and lower-casing.

    Args:
        jobs: Postings to filter; consumed once, in order.

    Returns:
        A new list of the unique postings in their original order.
    """
    first_by_identity: dict = {}
    for posting in jobs:
        identity = (
            posting.title.strip().lower(),
            posting.url.strip().lower(),
        )
        # setdefault stores only the earliest posting for an identity;
        # dicts preserve insertion order, so input order is retained.
        first_by_identity.setdefault(identity, posting)
    return list(first_by_identity.values())