# src/collectors/common.py — shared helpers for job-board collectors
# (scrape-header residue preserved: repo "nsbecf", author "acarey5", commit 851ce09 "new scrapping")
from __future__ import annotations
from typing import Iterable, List
from src.models import CompanyRecord, JobPosting
# Lowercase substrings that suggest a posting targets early-career candidates.
# NOTE(review): matching elsewhere is plain substring containment, so "intern"
# also matches words like "internal"/"international" — confirm this is acceptable.
# "internship" is redundant with "intern" under substring matching but kept for clarity.
ENTRY_LEVEL_SIGNALS = [
"new grad",
"university",
"early career",
"associate",
"entry level",
"campus",
"intern",
"internship",
]
# Seniority markers; any hit vetoes an entry-level classification.
SENIOR_SIGNALS = ["senior", "staff", "principal", "manager", "director"]
# Markers of remote-friendly roles ("hybrid remote" is subsumed by "remote" under substring matching).
REMOTE_SIGNALS = ["remote", "hybrid remote", "work from home"]
def is_remote_role(*texts: str) -> bool:
    """Return True when any remote-work marker appears in the given texts.

    All fragments are joined and lowercased before a substring scan against
    REMOTE_SIGNALS, so matching is case-insensitive and spans field boundaries.
    """
    haystack = " ".join(texts).lower()
    for marker in REMOTE_SIGNALS:
        if marker in haystack:
            return True
    return False
def is_entry_level_role(*texts: str) -> bool:
    """Return True when the texts read as entry-level and show no seniority markers.

    A single SENIOR_SIGNALS hit (e.g. "senior", "director") vetoes the result;
    otherwise any ENTRY_LEVEL_SIGNALS substring qualifies. Comparison is
    case-insensitive over the concatenated texts.
    """
    haystack = " ".join(texts).lower()
    looks_senior = any(marker in haystack for marker in SENIOR_SIGNALS)
    return not looks_senior and any(
        marker in haystack for marker in ENTRY_LEVEL_SIGNALS
    )
def normalize_job_posting(
    company_record: CompanyRecord,
    *,
    title: str,
    location: str,
    job_url: str,
    description: str,
    department: str = "",
    source_ats: str = "generic",
    resolved_url: str = "",
    employment_type: str = "",
    posted_date: str = "",
    raw_payload: dict | None = None,
) -> JobPosting:
    """Normalize connector output into the shared JobPosting schema.

    Text fields are stripped of surrounding whitespace and capped at a
    per-field length so downstream storage stays bounded. Remote and
    entry-level flags are derived heuristically from the text content.

    Args:
        company_record: Company the posting belongs to; supplies the company
            name and the careers-page fallback URL.
        title / location / job_url / description: Raw fields from the connector.
        department: Optional team/department label.
        source_ats: Identifier of the ATS the posting came from.
        resolved_url: Final posting URL; falls back to the company careers URL.
        employment_type: Optional employment type (e.g. full-time).
        posted_date: Optional posting date string.
        raw_payload: Original connector payload, kept for debugging.
    """

    def _clip(text: str, limit: int) -> str:
        # Trim surrounding whitespace, then cap the length.
        return text.strip()[:limit]

    return JobPosting(
        company=company_record.company,
        title=_clip(title, 160),
        location=_clip(location, 160),
        url=job_url.strip(),
        department=_clip(department, 160),
        description=_clip(description, 4000),
        ats=source_ats,
        source_ats=source_ats,
        resolved_url=resolved_url or company_record.careers_url,
        employment_type=_clip(employment_type, 80),
        posted_date=_clip(posted_date, 80),
        is_remote=is_remote_role(title, location, description),
        is_entry_level=is_entry_level_role(title, description),
        raw_payload=raw_payload or {},
    )
def dedupe_jobs(jobs: Iterable[JobPosting]) -> List[JobPosting]:
    """Deduplicate jobs by normalized (stripped, lowercased) title and URL.

    The first occurrence of each (title, url) pair wins; input order is
    otherwise preserved.
    """
    # Plain dicts preserve insertion order, so the first posting per key survives.
    unique: dict = {}
    for posting in jobs:
        fingerprint = (posting.title.strip().lower(), posting.url.strip().lower())
        unique.setdefault(fingerprint, posting)
    return list(unique.values())