"""μ‚¬λžŒμΈ 크둀러 β€” robots.txt Disallow μ€€μˆ˜, 상세 νŽ˜μ΄μ§€ λ―Έμˆ˜μ§‘."""
import logging
import re
from datetime import date, datetime
from urllib.parse import urlencode
from bs4 import Tag
from .base import BaseCrawler, JobItem, normalize_skill, extract_skills_from_text

logger = logging.getLogger(__name__)

# Saramin job-category search keywords
SARAMIN_KEYWORDS: dict[str, str] = {
    "데이터 μ—”μ§€λ‹ˆμ–΄": "λ°μ΄ν„°μ—”μ§€λ‹ˆμ–΄",
    "데이터 뢄석가": "데이터뢄석가",
    "데이터 μ‚¬μ΄μ–Έν‹°μŠ€νŠΈ": "λ°μ΄ν„°μ‚¬μ΄μ–Έν‹°μŠ€νŠΈ",
    "ML μ—”μ§€λ‹ˆμ–΄": "λ¨Έμ‹ λŸ¬λ‹μ—”μ§€λ‹ˆμ–΄",
}

SARAMIN_BASE = "https://www.saramin.co.kr"
SARAMIN_LIST = f"{SARAMIN_BASE}/zf_user/search/recruit"


def _parse_salary_saramin(text: str | None) -> tuple[int | None, int | None]:
    """Parse salary text like '3500~5000λ§Œμ›'; 'λ©΄μ ‘ ν›„ κ²°μ •' (negotiable) yields (None, None)."""
    if not text:
        return None, None
    nums = [int(n) for n in re.findall(r"\d{3,5}", text.replace(",", ""))]
    if len(nums) >= 2:
        return nums[0], nums[1]
    if len(nums) == 1:
        return nums[0], None
    return None, None
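
# Illustrative behavior of _parse_salary_saramin (examples, not exhaustive):
#   "3,500~5,000λ§Œμ›"   -> (3500, 5000)
#   "μ—°λ΄‰ 4000λ§Œμ› 이상"  -> (4000, None)
#   "λ©΄μ ‘ ν›„ κ²°μ •"        -> (None, None)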


class SaraminCrawler(BaseCrawler):
    SITE_NAME = "saramin"
    MIN_DELAY = 2.0  # per-request delay bounds in seconds (used by BaseCrawler)
    MAX_DELAY = 4.0

    def crawl(self, category: str, max_pages: int = 10) -> list[JobItem]:
        keyword = SARAMIN_KEYWORDS.get(category)
        if keyword is None:
            raise ValueError(f"Unknown category: {category}")
        jobs: list[JobItem] = []
        for page in range(1, max_pages + 1):
            logger.info(f"[saramin] {category}: page {page}")
            params = {
                "searchType": "search",
                "searchword": keyword,
                "recruitPage": page,
                "recruitPageCount": 40,
                "order": "reg_dt",  # newest first
                "recruitSort": "reg_dt",
                "jobtype": 1,  # prefer full-time postings
            }
            try:
                url = f"{SARAMIN_LIST}?{urlencode(params)}"
                soup = self._soup(url)
            except Exception as e:
                logger.error(f"[saramin] list page error (page={page}): {e}")
                break
            items = soup.select(".item_recruit")
            if not items:
                logger.info(f"[saramin] no more postings (page={page})")
                break
            for item in items:
                job = self._parse_item(item, category)
                if job:
                    jobs.append(job)
        logger.info(f"[saramin] {category}: collected {len(jobs)} postings")
        return jobs

    def crawl_all_categories(self, max_pages: int = 10) -> list[JobItem]:
        all_jobs: list[JobItem] = []
        for category in SARAMIN_KEYWORDS:
            all_jobs.extend(self.crawl(category, max_pages=max_pages))
        return all_jobs

    def _parse_item(self, item: Tag, category: str) -> JobItem | None:
        try:
            # the posting ID is stored in the element's value attribute
            job_id = item.get("value", "")
            if not job_id:
                return None
            title_el = item.select_one(".job_tit a")
            title = title_el.get("title", "").strip() if title_el else ""
            company_el = item.select_one(".corp_name a")
            company = company_el.get_text(strip=True) if company_el else ""
            # span order: [location, experience, education, employment type]
            conditions = item.select(".job_condition span")
            location = conditions[0].get_text(strip=True) if len(conditions) > 0 else None
            exp_str = conditions[1].get_text(strip=True) if len(conditions) > 1 else None
            exp_min, exp_max = self._parse_experience(exp_str)
            emp_raw = conditions[3].get_text(strip=True) if len(conditions) > 3 else None
            employment_type = self._normalize_employment(emp_raw)
            salary_el = item.select_one(".salary")
            sal_min, sal_max = _parse_salary_saramin(
                salary_el.get_text(strip=True) if salary_el else None
            )
            deadline_el = item.select_one(".job_date .date")
            deadline_date = self._parse_deadline(
                deadline_el.get_text(strip=True) if deadline_el else None
            )
            # skills: extracted from the title, sector tag, and job tags combined
            skill_parts = [title]
            skill_area = item.select_one(".job_sector")
            if skill_area:
                skill_parts.append(skill_area.get_text(" ", strip=True))
            # additional tag areas
            for tag_el in item.select(".job_tag, .tag_wrap, [class*='tag']"):
                skill_parts.append(tag_el.get_text(" ", strip=True))
            skill_text = " ".join(skill_parts)
            # de-duplicate while preserving order; overlapping tag areas repeat skills
            skills = list(dict.fromkeys(
                normalize_skill(s) for s in extract_skills_from_text(skill_text)
            ))
            # industry: parsed from .corp_sector or an industry-like class
            industry = None
            corp_sector = item.select_one(".corp_sector") or item.select_one("[class*='industry']")
            if corp_sector:
                industry = corp_sector.get_text(strip=True) or None
            # detail pages are Disallowed by robots.txt, so store the relay URL instead
            url = f"{SARAMIN_BASE}/zf_user/jobs/relay/view?rec_idx={job_id}"
            return JobItem(
                source_site="saramin",
                source_id=str(job_id),
                url=url,
                title=title,
                company_name=company,
                job_category=category,
                industry=industry,
                employment_type=employment_type,
                skills=skills,
                location=location,
                experience_min=exp_min,
                experience_max=exp_max,
                salary_min=sal_min,
                salary_max=sal_max,
                deadline_date=deadline_date,
            )
        except Exception as e:
            logger.warning(f"[saramin] failed to parse item: {e}")
            return None

    @staticmethod
    def _normalize_employment(text: str | None) -> str | None:
        if not text:
            return None
        t = text.strip()
        if "μ •κ·œ" in t:
            return "μ •κ·œμ§"
        if "계약" in t or "κΈ°κ°„μ œ" in t:
            return "계약직"
        if "인턴" in t:
            return "인턴"
        if "μ•„λ₯΄λ°”μ΄νŠΈ" in t or "파트" in t:
            return "μ•„λ₯΄λ°”μ΄νŠΈ"
        if "ν”„λ¦¬λžœμ„œ" in t:
            return "ν”„λ¦¬λžœμ„œ"
        return t or None
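
    # Illustrative mappings for _normalize_employment:
    #   "μ •κ·œμ§Β·κ³„μ•½μ§" -> "μ •κ·œμ§" (first match wins)
    #   "파트타임"      -> "μ•„λ₯΄λ°”μ΄νŠΈ"
    #   unrecognized text passes through unchanged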

    @staticmethod
    def _parse_experience(exp_str: str | None) -> tuple[int | None, int | None]:
        if not exp_str:
            return None, None
        if "μ‹ μž…" in exp_str and "κ²½λ ₯" not in exp_str:
            return 0, 0  # entry level only
        if "무관" in exp_str:  # any experience level, e.g. "κ²½λ ₯무관"
            return None, None
        nums = [int(n) for n in re.findall(r"\d+", exp_str)]
        if len(nums) >= 2:
            return nums[0], nums[1]
        if len(nums) == 1:
            return nums[0], None
        return None, None
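
    # Illustrative parses for _parse_experience:
    #   "κ²½λ ₯ 3~5λ…„" -> (3, 5)         "μ‹ μž…"          -> (0, 0)
    #   "κ²½λ ₯무관"   -> (None, None)   "κ²½λ ₯ 5λ…„ 이상"  -> (5, None)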

    @staticmethod
    def _parse_deadline(deadline_str: str | None) -> date | None:
        if not deadline_str:
            return None
        # "μƒμ‹œ" (always open) and "μ±„μš©μ‹œ" (until filled) have no fixed deadline
        if "μƒμ‹œ" in deadline_str or "μ±„μš©" in deadline_str:
            return None
        try:
            # expected format: "YY.MM.DD"
            return datetime.strptime(deadline_str, "%y.%m.%d").date()
        except ValueError:
            return None
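

# Minimal manual smoke test. This is a sketch: it assumes SaraminCrawler()
# takes no constructor arguments and that JobItem exposes its fields as
# attributes (e.g., a dataclass); both are defined in .base and may differ.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    crawler = SaraminCrawler()
    jobs = crawler.crawl("데이터 μ—”μ§€λ‹ˆμ–΄", max_pages=1)
    for job in jobs[:5]:
        print(job.title, job.skills)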