Spaces:
Running
Running
"""Saramin crawler — complies with robots.txt Disallow; detail pages are not fetched."""
| import logging | |
| import re | |
| from datetime import date, datetime | |
| from urllib.parse import urlencode | |
| from bs4 import BeautifulSoup | |
| from .base import BaseCrawler, JobItem, normalize_skill, extract_skills_from_text | |
| logger = logging.getLogger(__name__) | |
# Saramin job-category search keywords.
# Maps a canonical category label (used throughout this crawler) to the
# query keyword actually sent to the Saramin search endpoint.
# NOTE(review): the Hangul literals below appear mojibake-corrupted in this
# copy of the file — verify them against the original encoding before use.
SARAMIN_KEYWORDS: dict[str, str] = {
    "λ°μ΄ν° μμ§λμ΄": "λ°μ΄ν°μμ§λμ΄",
    "λ°μ΄ν° λΆμκ°": "λ°μ΄ν°λΆμκ°",
    "λ°μ΄ν° μ¬μ΄μΈν°μ€νΈ": "λ°μ΄ν°μ¬μ΄μΈν°μ€νΈ",
    "ML μμ§λμ΄": "λ¨Έμ λ¬λμμ§λμ΄",
}

# Site root and the public search-results listing endpoint.
SARAMIN_BASE = "https://www.saramin.co.kr"
SARAMIN_LIST = f"{SARAMIN_BASE}/zf_user/search/recruit"
| def _parse_salary_saramin(text: str | None) -> tuple[int | None, int | None]: | |
| """'3500~5000λ§μ' λλ 'λ©΄μ ν κ²°μ ' νμ±.""" | |
| if not text: | |
| return None, None | |
| nums = re.findall(r"\d{3,5}", text.replace(",", "")) | |
| nums = [int(n) for n in nums] | |
| if len(nums) >= 2: | |
| return nums[0], nums[1] | |
| if len(nums) == 1: | |
| return nums[0], None | |
| return None, None | |
class SaraminCrawler(BaseCrawler):
    """Crawler for Saramin job listings.

    Only the public search-result listing pages are fetched; detail pages
    are disallowed by robots.txt, so item URLs point at the relay endpoint
    instead.

    NOTE(review): the Hangul literals in this class appear mojibake-corrupted
    in this copy of the file — verify them against the original encoding.
    They are preserved byte-for-byte here.
    """

    SITE_NAME = "saramin"
    # Per-request politeness delay bounds (seconds); presumably consumed by
    # BaseCrawler's fetch logic — confirm against base.py.
    MIN_DELAY = 2.0
    MAX_DELAY = 4.0

    def crawl(self, category: str, max_pages: int = 10) -> list[JobItem]:
        """Collect postings for one category from the listing pages.

        Args:
            category: a key of SARAMIN_KEYWORDS.
            max_pages: upper bound on listing pages to fetch.

        Returns:
            List of parsed JobItem objects (possibly empty).

        Raises:
            ValueError: if *category* is not a known keyword.
        """
        keyword = SARAMIN_KEYWORDS.get(category)
        if keyword is None:
            raise ValueError(f"Unknown category: {category}")
        jobs: list[JobItem] = []
        for page in range(1, max_pages + 1):
            logger.info(f"[saramin] {category} β page {page}")
            params = {
                "searchType": "search",
                "searchword": keyword,
                "recruitPage": page,
                "recruitPageCount": 40,
                "order": "reg_dt",  # newest first
                "recruitSort": "reg_dt",
                "jobtype": 1,  # mainly regular (full-time) positions
            }
            try:
                url = f"{SARAMIN_LIST}?{urlencode(params)}"
                soup = self._soup(url)
            except Exception as e:
                # Best effort: stop paginating on any fetch/parse failure.
                logger.error(f"[saramin] λͺ©λ‘ νμ΄μ§ μ€λ₯ (page={page}): {e}")
                break
            items = soup.select(".item_recruit")
            if not items:
                # An empty result page means we ran past the last page.
                logger.info(f"[saramin] λ μ΄μ κ³΅κ³ μμ (page={page})")
                break
            for item in items:
                job = self._parse_item(item, category)
                if job:
                    jobs.append(job)
        logger.info(f"[saramin] {category} μ΄ {len(jobs)}건 μμ§ μλ£")
        return jobs

    def crawl_all_categories(self, max_pages: int = 10) -> list[JobItem]:
        """Crawl every category in SARAMIN_KEYWORDS and concatenate results."""
        all_jobs: list[JobItem] = []
        for category in SARAMIN_KEYWORDS:
            all_jobs.extend(self.crawl(category, max_pages=max_pages))
        return all_jobs

    def _parse_item(self, item: BeautifulSoup, category: str) -> JobItem | None:
        """Parse one ``.item_recruit`` card into a JobItem.

        Returns None when the card carries no job id, or when any parsing
        step raises (logged at WARNING level).
        """
        try:
            # The posting id lives in the card's ``value`` attribute.
            job_id = item.get("value", "")
            if not job_id:
                return None
            title_el = item.select_one(".job_tit a")
            title = title_el.get("title", "").strip() if title_el else ""
            company_el = item.select_one(".corp_name a")
            company = company_el.get_text(strip=True) if company_el else ""
            # Condition spans, in order: [location, experience, education,
            # employment type].
            conditions = item.select(".job_condition span")
            location = conditions[0].get_text(strip=True) if len(conditions) > 0 else None
            exp_str = conditions[1].get_text(strip=True) if len(conditions) > 1 else None
            exp_min, exp_max = self._parse_experience(exp_str)
            emp_raw = conditions[3].get_text(strip=True) if len(conditions) > 3 else None
            employment_type = self._normalize_employment(emp_raw)
            salary_el = item.select_one(".salary")
            sal_min, sal_max = _parse_salary_saramin(
                salary_el.get_text(strip=True) if salary_el else None
            )
            deadline_el = item.select_one(".job_date .date")
            deadline_date = self._parse_deadline(
                deadline_el.get_text(strip=True) if deadline_el else None
            )
            # Skills: combine the title, the sector area, and every tag-like
            # element into one text blob before extraction.
            skill_parts = [title]
            skill_area = item.select_one(".job_sector")
            if skill_area:
                skill_parts.append(skill_area.get_text(" ", strip=True))
            # Collect any additional tag elements.
            for tag_el in item.select(".job_tag, .tag_wrap, [class*='tag']"):
                skill_parts.append(tag_el.get_text(" ", strip=True))
            skill_text = " ".join(skill_parts)
            skills = [normalize_skill(s) for s in extract_skills_from_text(skill_text)]
            # Industry: parsed from .corp_sector or any industry-ish class.
            industry = None
            corp_sector = item.select_one(".corp_sector") or item.select_one("[class*='industry']")
            if corp_sector:
                industry = corp_sector.get_text(strip=True) or None
            # Detail pages are disallowed by robots.txt — link to the relay
            # view endpoint instead.
            url = f"{SARAMIN_BASE}/zf_user/jobs/relay/view?rec_idx={job_id}"
            return JobItem(
                source_site="saramin",
                source_id=str(job_id),
                url=url,
                title=title,
                company_name=company,
                job_category=category,
                industry=industry,
                employment_type=employment_type,
                skills=skills,
                location=location,
                experience_min=exp_min,
                experience_max=exp_max,
                salary_min=sal_min,
                salary_max=sal_max,
                deadline_date=deadline_date,
            )
        except Exception as e:
            logger.warning(f"[saramin] νμ± μ€ν¨: {e}")
            return None

    @staticmethod
    def _normalize_employment(text: str | None) -> str | None:
        """Map a raw employment-type label to a canonical one.

        FIX: declared ``@staticmethod``. The original definition had no
        ``self`` parameter yet was invoked as ``self._normalize_employment(...)``,
        so every call raised TypeError — silently swallowed by the broad
        ``except`` in ``_parse_item``, making every card fail to parse.
        """
        if not text:
            return None
        t = text.strip()
        if "μ κ·" in t:
            return "μ κ·μ§"
        if "κ³μ½" in t or "κΈ°κ°μ " in t:
            return "κ³μ½μ§"
        if "μΈν΄" in t:
            return "μΈν΄"
        if "μλ₯΄λ°μ΄νΈ" in t or "ννΈ" in t:
            return "μλ₯΄λ°μ΄νΈ"
        if "ν리λμ" in t:
            return "ν리λμ"
        # Unknown label: pass it through (an empty string becomes None).
        return t or None

    @staticmethod
    def _parse_experience(exp_str: str | None) -> tuple[int | None, int | None]:
        """Parse an experience requirement into (min_years, max_years).

        FIX: declared ``@staticmethod`` — same missing-``self`` defect as
        ``_normalize_employment``.

        Entry-level text maps to (0, 0); "any experience" maps to
        (None, None); otherwise the first one or two numbers are used.
        """
        if not exp_str:
            return None, None
        if "μ μ " in exp_str and "κ²½λ ₯" not in exp_str:
            return 0, 0
        if "κ²½λ ₯무κ΄" in exp_str or "무κ΄" in exp_str:
            return None, None
        nums = [int(n) for n in re.findall(r"\d+", exp_str)]
        if len(nums) >= 2:
            return nums[0], nums[1]
        if len(nums) == 1:
            return nums[0], None
        return None, None

    @staticmethod
    def _parse_deadline(deadline_str: str | None) -> date | None:
        """Parse a 'YY.MM.DD' deadline string into a date.

        FIX: declared ``@staticmethod`` — same missing-``self`` defect as
        ``_normalize_employment``.

        Rolling/always-open postings (and unparsable strings) yield None.
        """
        if not deadline_str:
            return None
        if "μμ" in deadline_str or "μ±μ©" in deadline_str:
            return None
        try:
            return datetime.strptime(deadline_str, "%y.%m.%d").date()
        except ValueError:
            return None